diff --git a/.ci/generate-buildkite-pipeline-premerge b/.ci/generate-buildkite-pipeline-premerge index f32eb7213b9402..7b1b4b8dac8a20 100755 --- a/.ci/generate-buildkite-pipeline-premerge +++ b/.ci/generate-buildkite-pipeline-premerge @@ -22,19 +22,20 @@ set -o pipefail # Environment variables script works with: -# Fetch origin/main to have an up to date merge base for main...HEAD diff. -git fetch origin main:main +# Set by buildkite +: ${BUILDKITE_PULL_REQUEST_BASE_BRANCH:=} +: ${BUILDKITE_COMMIT:=} +: ${BUILDKITE_BRANCH:=} +# Fetch origin to have an up to date merge base for the diff. +git fetch origin # List of files affected by this commit -: ${MODIFIED_FILES:=$(git diff --name-only main...HEAD)} +: ${MODIFIED_FILES:=$(git diff --name-only origin/${BUILDKITE_PULL_REQUEST_BASE_BRANCH}...HEAD)} # Filter rules for generic windows tests : ${WINDOWS_AGENTS:='{"queue": "windows"}'} # Filter rules for generic linux tests : ${LINUX_AGENTS:='{"queue": "linux"}'} # Service agents, for interacting with Phabricator. 
: ${SERVICE_AGENTS:='{"queue": "service"}'} -# Set by buildkite -: ${BUILDKITE_COMMIT:=} -: ${BUILDKITE_BRANCH:=} reviewID="$(git log --format=%B -n 1 | sed -nE 's/^Review-ID:[[:space:]]*(.+)$/\1/p')" if [[ "${reviewID}" != "" ]]; then diff --git a/.github/workflows/libclc-tests.yml b/.github/workflows/libclc-tests.yml index 29d050db2f12c0..23192f776a985e 100644 --- a/.github/workflows/libclc-tests.yml +++ b/.github/workflows/libclc-tests.yml @@ -36,5 +36,4 @@ jobs: name: Test libclc uses: ./.github/workflows/llvm-project-tests.yml with: - build_target: '' projects: clang;libclc diff --git a/.github/workflows/lldb-tests.yml b/.github/workflows/lldb-tests.yml index ef5d7c7d581b7d..6bb9721956258f 100644 --- a/.github/workflows/lldb-tests.yml +++ b/.github/workflows/lldb-tests.yml @@ -36,5 +36,4 @@ jobs: name: Build lldb uses: ./.github/workflows/llvm-project-tests.yml with: - build_target: '' projects: clang;lldb diff --git a/.github/workflows/llvm-project-tests.yml b/.github/workflows/llvm-project-tests.yml index a1404e1f1efa95..a52dd2db8035dd 100644 --- a/.github/workflows/llvm-project-tests.yml +++ b/.github/workflows/llvm-project-tests.yml @@ -14,7 +14,7 @@ on: required: false os_list: required: false - default: '["ubuntu-latest", "windows-2019", "macOS-11"]' + default: '["ubuntu-latest", "windows-2019", "macOS-13"]' python_version: required: false type: string @@ -22,8 +22,9 @@ on: workflow_call: inputs: build_target: - required: true + required: false type: string + default: "all" projects: required: true @@ -38,9 +39,7 @@ on: type: string # Use windows-2019 due to: # https://developercommunity.visualstudio.com/t/Prev-Issue---with-__assume-isnan-/1597317 - # We're using a specific version of macOS due to: - # https://github.com/actions/virtual-environments/issues/5900 - default: '["ubuntu-latest", "windows-2019", "macOS-11"]' + default: '["ubuntu-latest", "windows-2019", "macOS-13"]' python_version: required: false @@ -59,6 +58,10 @@ jobs: lit-tests: name: Lit 
Tests runs-on: ${{ matrix.os }} + container: + image: ${{(startsWith(matrix.os, 'ubuntu') && 'ghcr.io/llvm/ci-ubuntu-22.04:latest') || null}} + volumes: + - /mnt/:/mnt/ strategy: fail-fast: false matrix: @@ -78,6 +81,7 @@ jobs: with: python-version: ${{ inputs.python_version }} - name: Install Ninja + if: runner.os != 'Linux' uses: llvm/actions/install-ninja@main # actions/checkout deletes any existing files in the new git directory, # so this needs to either run before ccache-action or it has to use @@ -95,24 +99,51 @@ jobs: # run creates a new cache entry so we want to ensure that we have # enough cache space for all the tests to run at once and still # fit under the 10 GB limit. - max-size: 500M + # Default to 2G to workaround: https://github.com/hendrikmuhs/ccache-action/issues/174 + max-size: 2G key: ${{ matrix.os }} variant: sccache - name: Build and Test - uses: llvm/actions/build-test-llvm-project@main env: # Workaround for https://github.com/actions/virtual-environments/issues/5900. 
# This should be a no-op for non-mac OSes PKG_CONFIG_PATH: /usr/local/Homebrew/Library/Homebrew/os/mac/pkgconfig//12 - with: - cmake_args: '-GNinja -DLLVM_ENABLE_PROJECTS="${{ inputs.projects }}" -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON -DLLDB_INCLUDE_TESTS=OFF -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache ${{ inputs.extra_cmake_args }}' - build_target: '${{ inputs.build_target }}' + shell: bash + id: build-llvm + run: | + if [ "${{ runner.os }}" == "Linux" ]; then + builddir="/mnt/build/" + mkdir -p $builddir + extra_cmake_args="-DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang" + else + builddir="$(pwd)"/build + fi + if [ "${{ runner.os }}" == "macOS" ]; then + # Workaround test failure on some lld tests on MacOS + # https://github.com/llvm/llvm-project/issues/81967 + extra_cmake_args="-DLLVM_DISABLE_ASSEMBLY_FILES=ON" + fi + echo "llvm-builddir=$builddir" >> "$GITHUB_OUTPUT" + cmake -G Ninja \ + -B "$builddir" \ + -S llvm \ + -DLLVM_ENABLE_PROJECTS="${{ inputs.projects }}" \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DLLDB_INCLUDE_TESTS=OFF \ + -DCMAKE_C_COMPILER_LAUNCHER=sccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \ + $extra_cmake_args \ + ${{ inputs.extra_cmake_args }} + ninja -C "$builddir" '${{ inputs.build_target }}' - name: Build and Test libclc if: "!startsWith(matrix.os, 'windows') && contains(inputs.projects, 'libclc')" + env: + LLVM_BUILDDIR: ${{ steps.build-llvm.outputs.llvm-builddir }} run: | # Make sure all of LLVM libraries that llvm-config needs are built. 
- ninja -C build - cmake -G Ninja -S libclc -B libclc-build -DLLVM_DIR="$(pwd)"/build/lib/cmake/llvm -DLIBCLC_TARGETS_TO_BUILD="amdgcn--;amdgcn--amdhsa;r600--;nvptx--;nvptx64--;nvptx--nvidiacl;nvptx64--nvidiacl" + ninja -C "$LLVM_BUILDDIR" + cmake -G Ninja -S libclc -B libclc-build -DLLVM_DIR="$LLVM_BUILDDIR"/lib/cmake/llvm -DLIBCLC_TARGETS_TO_BUILD="amdgcn--;amdgcn--amdhsa;r600--;nvptx--;nvptx64--;nvptx--nvidiacl;nvptx64--nvidiacl" ninja -C libclc-build ninja -C libclc-build test diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml index cc9855ce182b2b..64d60bc3da45e1 100644 --- a/.github/workflows/llvm-tests.yml +++ b/.github/workflows/llvm-tests.yml @@ -27,31 +27,13 @@ concurrency: cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} jobs: - check_all: + check-all: if: github.repository_owner == 'llvm' - name: Test llvm,clang,libclc + name: Build and Test uses: ./.github/workflows/llvm-project-tests.yml with: build_target: check-all - projects: clang;libclc - - # These need to be separate from the check_all job, becuase there is not enough disk - # space to build all these projects on Windows. 
- build_lldb: - if: github.repository_owner == 'llvm' - name: Build lldb - uses: ./.github/workflows/llvm-project-tests.yml - with: - build_target: '' - projects: clang;lldb - - check_lld: - if: github.repository_owner == 'llvm' - name: Test lld - uses: ./.github/workflows/llvm-project-tests.yml - with: - build_target: check-lld - projects: lld + projects: clang;lld;libclc;lldb abi-dump-setup: if: github.repository_owner == 'llvm' @@ -60,6 +42,7 @@ jobs: BASELINE_REF: ${{ steps.vars.outputs.BASELINE_REF }} ABI_HEADERS: ${{ steps.vars.outputs.ABI_HEADERS }} BASELINE_VERSION_MAJOR: ${{ steps.vars.outputs.BASELINE_VERSION_MAJOR }} + BASELINE_VERSION_MINOR: ${{ steps.vars.outputs.BASELINE_VERSION_MINOR }} LLVM_VERSION_MAJOR: ${{ steps.version.outputs.LLVM_VERSION_MAJOR }} LLVM_VERSION_MINOR: ${{ steps.version.outputs.LLVM_VERSION_MINOR }} LLVM_VERSION_PATCH: ${{ steps.version.outputs.LLVM_VERSION_PATCH }} @@ -76,7 +59,14 @@ jobs: - name: Setup Variables id: vars run: | - if [ ${{ steps.version.outputs.LLVM_VERSION_MINOR }} -ne 0 ] || [ ${{ steps.version.outputs.LLVM_VERSION_PATCH }} -eq 0 ]; then + # C++ ABI: + # 18.1.0 we aren't doing ABI checks. + # 18.1.1 We want to check 18.1.0. 
+ # C ABI: + # 18.1.0 We want to check 17.0.x + # 18.1.1 We want to check 18.1.0 + echo "BASELINE_VERSION_MINOR=1" >> "$GITHUB_OUTPUT" + if [ ${{ steps.version.outputs.LLVM_VERSION_PATCH }} -eq 0 ]; then { echo "BASELINE_VERSION_MAJOR=$(( ${{ steps.version.outputs.LLVM_VERSION_MAJOR }} - 1))" echo "ABI_HEADERS=llvm-c" @@ -100,7 +90,7 @@ jobs: include: - name: build-baseline llvm_version_major: ${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }} - ref: llvmorg-${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }}.0.0 + ref: llvmorg-${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }}.${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MINOR }}.0 repo: llvm/llvm-project - name: build-latest llvm_version_major: ${{ needs.abi-dump-setup.outputs.LLVM_VERSION_MAJOR }} @@ -143,7 +133,7 @@ jobs: else touch llvm.symbols fi - abi-dumper "$EXTRA_ARGS" -lver ${{ matrix.ref }} -skip-cxx -public-headers ./install/include/${{ needs.abi-dump-setup.outputs.ABI_HEADERS }} -o ${{ matrix.ref }}.abi ./install/lib/libLLVM.so + abi-dumper $EXTRA_ARGS -lver ${{ matrix.ref }} -skip-cxx -public-headers ./install/include/${{ needs.abi-dump-setup.outputs.ABI_HEADERS }} -o ${{ matrix.ref }}.abi ./install/lib/libLLVM.so # Remove symbol versioning from dumps, so we can compare across major versions. sed -i 's/LLVM_${{ matrix.llvm_version_major }}/LLVM_NOVERSION/' ${{ matrix.ref }}.abi - name: Upload ABI file @@ -193,7 +183,7 @@ jobs: # FIXME: Reading of gzip'd abi files on the GitHub runners stop # working some time in March of 2021, likely due to a change in the # runner's environment. 
- abi-compliance-checker "$EXTRA_ARGS" -l libLLVM.so -old build-baseline/*.abi -new build-latest/*.abi || test "${{ needs.abi-dump-setup.outputs.ABI_HEADERS }}" = "llvm-c" + abi-compliance-checker $EXTRA_ARGS -l libLLVM.so -old build-baseline/*.abi -new build-latest/*.abi || test "${{ needs.abi-dump-setup.outputs.ABI_HEADERS }}" = "llvm-c" - name: Upload ABI Comparison if: always() uses: actions/upload-artifact@v3 diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml index 5223089ee8a93d..3d0c23917bd403 100644 --- a/.github/workflows/pr-code-format.yml +++ b/.github/workflows/pr-code-format.yml @@ -1,5 +1,9 @@ name: "Check code formatting" -on: pull_request_target +on: + pull_request_target: + branches: + - main + permissions: pull-requests: write diff --git a/.github/workflows/release-lit.yml b/.github/workflows/release-lit.yml index 36b0b6edd518fc..0316ba406041d6 100644 --- a/.github/workflows/release-lit.yml +++ b/.github/workflows/release-lit.yml @@ -58,7 +58,7 @@ jobs: cd llvm/utils/lit # Remove 'dev' suffix from lit version. 
sed -i 's/ + "dev"//g' lit/__init__.py - python3 setup.py sdist + python3 setup.py sdist bdist_wheel - name: Upload lit to test.pypi.org uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/release-tasks.yml b/.github/workflows/release-tasks.yml index f2a831ad3577ad..53da8662b0203a 100644 --- a/.github/workflows/release-tasks.yml +++ b/.github/workflows/release-tasks.yml @@ -28,6 +28,7 @@ jobs: name: Create a New Release runs-on: ubuntu-latest needs: validate-tag + steps: - name: Install Dependencies run: | @@ -40,8 +41,9 @@ jobs: - name: Create Release env: GITHUB_TOKEN: ${{ github.token }} + USER_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }} run: | - ./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --release ${{ needs.validate-tag.outputs.release-version }} --user ${{ github.actor }} create + ./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --release ${{ needs.validate-tag.outputs.release-version }} --user ${{ github.actor }} --user-token "$USER_TOKEN" create release-documentation: name: Build and Upload Release Documentation needs: diff --git a/.github/workflows/version-check.py b/.github/workflows/version-check.py index 7f805f304e3d76..f75fd50300881b 100755 --- a/.github/workflows/version-check.py +++ b/.github/workflows/version-check.py @@ -16,7 +16,7 @@ def get_version_from_tag(tag): m = re.match("llvmorg-([0-9]+)-init", tag) if m: - return (m.group(1), "0", "0") + return (m.group(1), "1", "0") raise Exception(f"error: Tag is not valid: {tag}") diff --git a/clang-tools-extra/clangd/HeuristicResolver.cpp b/clang-tools-extra/clangd/HeuristicResolver.cpp index 3c147b6b582bf0..26d54200eeffd2 100644 --- a/clang-tools-extra/clangd/HeuristicResolver.cpp +++ b/clang-tools-extra/clangd/HeuristicResolver.cpp @@ -16,6 +16,80 @@ namespace clang { namespace clangd { +namespace { + +// Helper class for implementing HeuristicResolver. 
+// Unlike HeuristicResolver which is a long-lived class, +// a new instance of this class is created for every external +// call into a HeuristicResolver operation. That allows this +// class to store state that's local to such a top-level call, +// particularly "recursion protection sets" that keep track of +// nodes that have already been seen to avoid infinite recursion. +class HeuristicResolverImpl { +public: + HeuristicResolverImpl(ASTContext &Ctx) : Ctx(Ctx) {} + + // These functions match the public interface of HeuristicResolver + // (but aren't const since they may modify the recursion protection sets). + std::vector + resolveMemberExpr(const CXXDependentScopeMemberExpr *ME); + std::vector + resolveDeclRefExpr(const DependentScopeDeclRefExpr *RE); + std::vector resolveTypeOfCallExpr(const CallExpr *CE); + std::vector resolveCalleeOfCallExpr(const CallExpr *CE); + std::vector + resolveUsingValueDecl(const UnresolvedUsingValueDecl *UUVD); + std::vector + resolveDependentNameType(const DependentNameType *DNT); + std::vector resolveTemplateSpecializationType( + const DependentTemplateSpecializationType *DTST); + const Type *resolveNestedNameSpecifierToType(const NestedNameSpecifier *NNS); + const Type *getPointeeType(const Type *T); + +private: + ASTContext &Ctx; + + // Recursion protection sets + llvm::SmallSet SeenDependentNameTypes; + + // Given a tag-decl type and a member name, heuristically resolve the + // name to one or more declarations. + // The current heuristic is simply to look up the name in the primary + // template. This is a heuristic because the template could potentially + // have specializations that declare different members. + // Multiple declarations could be returned if the name is overloaded + // (e.g. an overloaded method in the primary template). + // This heuristic will give the desired answer in many cases, e.g. + // for a call to vector::size(). 
+ std::vector + resolveDependentMember(const Type *T, DeclarationName Name, + llvm::function_ref Filter); + + // Try to heuristically resolve the type of a possibly-dependent expression + // `E`. + const Type *resolveExprToType(const Expr *E); + std::vector resolveExprToDecls(const Expr *E); + + // Helper function for HeuristicResolver::resolveDependentMember() + // which takes a possibly-dependent type `T` and heuristically + // resolves it to a CXXRecordDecl in which we can try name lookup. + CXXRecordDecl *resolveTypeToRecordDecl(const Type *T); + + // This is a reimplementation of CXXRecordDecl::lookupDependentName() + // so that the implementation can call into other HeuristicResolver helpers. + // FIXME: Once HeuristicResolver is upstreamed to the clang libraries + // (https://github.com/clangd/clangd/discussions/1662), + // CXXRecordDecl::lookupDependentName() can be removed, and its call sites + // can be modified to benefit from the more comprehensive heuristics offered + // by HeuristicResolver instead. + std::vector + lookupDependentName(CXXRecordDecl *RD, DeclarationName Name, + llvm::function_ref Filter); + bool findOrdinaryMemberInDependentClasses(const CXXBaseSpecifier *Specifier, + CXXBasePath &Path, + DeclarationName Name); +}; + // Convenience lambdas for use as the 'Filter' parameter of // HeuristicResolver::resolveDependentMember(). const auto NoFilter = [](const NamedDecl *D) { return true; }; @@ -31,8 +105,6 @@ const auto TemplateFilter = [](const NamedDecl *D) { return isa(D); }; -namespace { - const Type *resolveDeclsToType(const std::vector &Decls, ASTContext &Ctx) { if (Decls.size() != 1) // Names an overload set -- just bail. @@ -46,12 +118,10 @@ const Type *resolveDeclsToType(const std::vector &Decls, return nullptr; } -} // namespace - // Helper function for HeuristicResolver::resolveDependentMember() // which takes a possibly-dependent type `T` and heuristically // resolves it to a CXXRecordDecl in which we can try name lookup. 
-CXXRecordDecl *HeuristicResolver::resolveTypeToRecordDecl(const Type *T) const { +CXXRecordDecl *HeuristicResolverImpl::resolveTypeToRecordDecl(const Type *T) { assert(T); // Unwrap type sugar such as type aliases. @@ -84,7 +154,7 @@ CXXRecordDecl *HeuristicResolver::resolveTypeToRecordDecl(const Type *T) const { return TD->getTemplatedDecl(); } -const Type *HeuristicResolver::getPointeeType(const Type *T) const { +const Type *HeuristicResolverImpl::getPointeeType(const Type *T) { if (!T) return nullptr; @@ -117,8 +187,8 @@ const Type *HeuristicResolver::getPointeeType(const Type *T) const { return FirstArg.getAsType().getTypePtrOrNull(); } -std::vector HeuristicResolver::resolveMemberExpr( - const CXXDependentScopeMemberExpr *ME) const { +std::vector HeuristicResolverImpl::resolveMemberExpr( + const CXXDependentScopeMemberExpr *ME) { // If the expression has a qualifier, try resolving the member inside the // qualifier's type. // Note that we cannot use a NonStaticFilter in either case, for a couple @@ -164,14 +234,14 @@ std::vector HeuristicResolver::resolveMemberExpr( return resolveDependentMember(BaseType, ME->getMember(), NoFilter); } -std::vector HeuristicResolver::resolveDeclRefExpr( - const DependentScopeDeclRefExpr *RE) const { +std::vector +HeuristicResolverImpl::resolveDeclRefExpr(const DependentScopeDeclRefExpr *RE) { return resolveDependentMember(RE->getQualifier()->getAsType(), RE->getDeclName(), StaticFilter); } std::vector -HeuristicResolver::resolveTypeOfCallExpr(const CallExpr *CE) const { +HeuristicResolverImpl::resolveTypeOfCallExpr(const CallExpr *CE) { const auto *CalleeType = resolveExprToType(CE->getCallee()); if (!CalleeType) return {}; @@ -187,7 +257,7 @@ HeuristicResolver::resolveTypeOfCallExpr(const CallExpr *CE) const { } std::vector -HeuristicResolver::resolveCalleeOfCallExpr(const CallExpr *CE) const { +HeuristicResolverImpl::resolveCalleeOfCallExpr(const CallExpr *CE) { if (const auto *ND = dyn_cast_or_null(CE->getCalleeDecl())) { 
return {ND}; } @@ -195,29 +265,31 @@ HeuristicResolver::resolveCalleeOfCallExpr(const CallExpr *CE) const { return resolveExprToDecls(CE->getCallee()); } -std::vector HeuristicResolver::resolveUsingValueDecl( - const UnresolvedUsingValueDecl *UUVD) const { +std::vector HeuristicResolverImpl::resolveUsingValueDecl( + const UnresolvedUsingValueDecl *UUVD) { return resolveDependentMember(UUVD->getQualifier()->getAsType(), UUVD->getNameInfo().getName(), ValueFilter); } -std::vector HeuristicResolver::resolveDependentNameType( - const DependentNameType *DNT) const { +std::vector +HeuristicResolverImpl::resolveDependentNameType(const DependentNameType *DNT) { + if (auto [_, inserted] = SeenDependentNameTypes.insert(DNT); !inserted) + return {}; return resolveDependentMember( resolveNestedNameSpecifierToType(DNT->getQualifier()), DNT->getIdentifier(), TypeFilter); } std::vector -HeuristicResolver::resolveTemplateSpecializationType( - const DependentTemplateSpecializationType *DTST) const { +HeuristicResolverImpl::resolveTemplateSpecializationType( + const DependentTemplateSpecializationType *DTST) { return resolveDependentMember( resolveNestedNameSpecifierToType(DTST->getQualifier()), DTST->getIdentifier(), TemplateFilter); } std::vector -HeuristicResolver::resolveExprToDecls(const Expr *E) const { +HeuristicResolverImpl::resolveExprToDecls(const Expr *E) { if (const auto *ME = dyn_cast(E)) { return resolveMemberExpr(ME); } @@ -236,7 +308,7 @@ HeuristicResolver::resolveExprToDecls(const Expr *E) const { return {}; } -const Type *HeuristicResolver::resolveExprToType(const Expr *E) const { +const Type *HeuristicResolverImpl::resolveExprToType(const Expr *E) { std::vector Decls = resolveExprToDecls(E); if (!Decls.empty()) return resolveDeclsToType(Decls, Ctx); @@ -244,8 +316,8 @@ const Type *HeuristicResolver::resolveExprToType(const Expr *E) const { return E->getType().getTypePtr(); } -const Type *HeuristicResolver::resolveNestedNameSpecifierToType( - const 
NestedNameSpecifier *NNS) const { +const Type *HeuristicResolverImpl::resolveNestedNameSpecifierToType( + const NestedNameSpecifier *NNS) { if (!NNS) return nullptr; @@ -270,8 +342,6 @@ const Type *HeuristicResolver::resolveNestedNameSpecifierToType( return nullptr; } -namespace { - bool isOrdinaryMember(const NamedDecl *ND) { return ND->isInIdentifierNamespace(Decl::IDNS_Ordinary | Decl::IDNS_Tag | Decl::IDNS_Member); @@ -287,11 +357,9 @@ bool findOrdinaryMember(const CXXRecordDecl *RD, CXXBasePath &Path, return false; } -} // namespace - -bool HeuristicResolver::findOrdinaryMemberInDependentClasses( +bool HeuristicResolverImpl::findOrdinaryMemberInDependentClasses( const CXXBaseSpecifier *Specifier, CXXBasePath &Path, - DeclarationName Name) const { + DeclarationName Name) { CXXRecordDecl *RD = resolveTypeToRecordDecl(Specifier->getType().getTypePtr()); if (!RD) @@ -299,9 +367,9 @@ bool HeuristicResolver::findOrdinaryMemberInDependentClasses( return findOrdinaryMember(RD, Path, Name); } -std::vector HeuristicResolver::lookupDependentName( +std::vector HeuristicResolverImpl::lookupDependentName( CXXRecordDecl *RD, DeclarationName Name, - llvm::function_ref Filter) const { + llvm::function_ref Filter) { std::vector Results; // Lookup in the class. 
@@ -332,9 +400,9 @@ std::vector HeuristicResolver::lookupDependentName( return Results; } -std::vector HeuristicResolver::resolveDependentMember( +std::vector HeuristicResolverImpl::resolveDependentMember( const Type *T, DeclarationName Name, - llvm::function_ref Filter) const { + llvm::function_ref Filter) { if (!T) return {}; if (auto *ET = T->getAs()) { @@ -349,6 +417,44 @@ std::vector HeuristicResolver::resolveDependentMember( } return {}; } +} // namespace + +std::vector HeuristicResolver::resolveMemberExpr( + const CXXDependentScopeMemberExpr *ME) const { + return HeuristicResolverImpl(Ctx).resolveMemberExpr(ME); +} +std::vector HeuristicResolver::resolveDeclRefExpr( + const DependentScopeDeclRefExpr *RE) const { + return HeuristicResolverImpl(Ctx).resolveDeclRefExpr(RE); +} +std::vector +HeuristicResolver::resolveTypeOfCallExpr(const CallExpr *CE) const { + return HeuristicResolverImpl(Ctx).resolveTypeOfCallExpr(CE); +} +std::vector +HeuristicResolver::resolveCalleeOfCallExpr(const CallExpr *CE) const { + return HeuristicResolverImpl(Ctx).resolveCalleeOfCallExpr(CE); +} +std::vector HeuristicResolver::resolveUsingValueDecl( + const UnresolvedUsingValueDecl *UUVD) const { + return HeuristicResolverImpl(Ctx).resolveUsingValueDecl(UUVD); +} +std::vector HeuristicResolver::resolveDependentNameType( + const DependentNameType *DNT) const { + return HeuristicResolverImpl(Ctx).resolveDependentNameType(DNT); +} +std::vector +HeuristicResolver::resolveTemplateSpecializationType( + const DependentTemplateSpecializationType *DTST) const { + return HeuristicResolverImpl(Ctx).resolveTemplateSpecializationType(DTST); +} +const Type *HeuristicResolver::resolveNestedNameSpecifierToType( + const NestedNameSpecifier *NNS) const { + return HeuristicResolverImpl(Ctx).resolveNestedNameSpecifierToType(NNS); +} +const Type *HeuristicResolver::getPointeeType(const Type *T) const { + return HeuristicResolverImpl(Ctx).getPointeeType(T); +} } // namespace clangd } // namespace clang 
diff --git a/clang-tools-extra/clangd/HeuristicResolver.h b/clang-tools-extra/clangd/HeuristicResolver.h index dc04123d37593c..dcc063bbc4adc0 100644 --- a/clang-tools-extra/clangd/HeuristicResolver.h +++ b/clang-tools-extra/clangd/HeuristicResolver.h @@ -77,43 +77,6 @@ class HeuristicResolver { private: ASTContext &Ctx; - - // Given a tag-decl type and a member name, heuristically resolve the - // name to one or more declarations. - // The current heuristic is simply to look up the name in the primary - // template. This is a heuristic because the template could potentially - // have specializations that declare different members. - // Multiple declarations could be returned if the name is overloaded - // (e.g. an overloaded method in the primary template). - // This heuristic will give the desired answer in many cases, e.g. - // for a call to vector::size(). - std::vector resolveDependentMember( - const Type *T, DeclarationName Name, - llvm::function_ref Filter) const; - - // Try to heuristically resolve the type of a possibly-dependent expression - // `E`. - const Type *resolveExprToType(const Expr *E) const; - std::vector resolveExprToDecls(const Expr *E) const; - - // Helper function for HeuristicResolver::resolveDependentMember() - // which takes a possibly-dependent type `T` and heuristically - // resolves it to a CXXRecordDecl in which we can try name lookup. - CXXRecordDecl *resolveTypeToRecordDecl(const Type *T) const; - - // This is a reimplementation of CXXRecordDecl::lookupDependentName() - // so that the implementation can call into other HeuristicResolver helpers. - // FIXME: Once HeuristicResolver is upstreamed to the clang libraries - // (https://github.com/clangd/clangd/discussions/1662), - // CXXRecordDecl::lookupDepenedentName() can be removed, and its call sites - // can be modified to benefit from the more comprehensive heuristics offered - // by HeuristicResolver instead. 
- std::vector lookupDependentName( - CXXRecordDecl *RD, DeclarationName Name, - llvm::function_ref Filter) const; - bool findOrdinaryMemberInDependentClasses(const CXXBaseSpecifier *Specifier, - CXXBasePath &Path, - DeclarationName Name) const; }; } // namespace clangd diff --git a/clang-tools-extra/clangd/InlayHints.cpp b/clang-tools-extra/clangd/InlayHints.cpp index 5722ca8f66eb72..5f3687ac581017 100644 --- a/clang-tools-extra/clangd/InlayHints.cpp +++ b/clang-tools-extra/clangd/InlayHints.cpp @@ -651,16 +651,12 @@ class InlayHintVisitor : public RecursiveASTVisitor { // implied object argument ([over.call.func]), the list of provided // arguments is preceded by the implied object argument for the purposes of // this correspondence... - // - // However, we don't have the implied object argument - // for static operator() per clang::Sema::BuildCallToObjectOfClassType. llvm::ArrayRef Args = {E->getArgs(), E->getNumArgs()}; // We don't have the implied object argument through a function pointer // either. if (const CXXMethodDecl *Method = dyn_cast_or_null(Callee.Decl)) - if (Method->isInstance() && - (IsFunctor || Method->hasCXXExplicitFunctionObjectParameter())) + if (IsFunctor || Method->hasCXXExplicitFunctionObjectParameter()) Args = Args.drop_front(1); processCall(Callee, Args); return true; diff --git a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp index 29cff68cf03b2e..0af6036734ba53 100644 --- a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp +++ b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp @@ -1009,6 +1009,33 @@ TEST_F(TargetDeclTest, DependentTypes) { )cpp"; EXPECT_DECLS("DependentTemplateSpecializationTypeLoc", "template struct B"); + + // Dependent name with recursive definition. We don't expect a + // result, but we shouldn't get into a stack overflow either. 
+ Code = R"cpp( + template + struct waldo { + typedef typename waldo::type::[[next]] type; + }; + )cpp"; + EXPECT_DECLS("DependentNameTypeLoc"); + + // Similar to above but using mutually recursive templates. + Code = R"cpp( + template + struct odd; + + template + struct even { + using type = typename odd::type::next; + }; + + template + struct odd { + using type = typename even::type::[[next]]; + }; + )cpp"; + EXPECT_DECLS("DependentNameTypeLoc"); } TEST_F(TargetDeclTest, TypedefCascade) { diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 5758b5acbc0b56..8621444364fb20 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -51,21 +51,35 @@ Improvements to clangd Inlay hints ^^^^^^^^^^^ -Diagnostics -^^^^^^^^^^^ - -Semantic Highlighting -^^^^^^^^^^^^^^^^^^^^^ +- Type hints + * Improved heuristics for showing sugared vs. desugared types + * Some hints which provide no information (e.g. ````) are now omitted +- Parameter hints + * Parameter hints are now shown for calls through function pointers + * Parameter hints are now shown for calls to a class's ``operator()`` + * No longer show bogus parameter hints for some builtins like ``__builtin_dump_struct`` Compile flags ^^^^^^^^^^^^^ +- System include extractor (``--query-driver``) improvements + * The directory containing builtin headers is now excluded from extracted system includes + * Various flags which can affect the system includes (``--target``, ``--stdlib``, ``-specs``) are now forwarded to the driver + * Fixed a bug where clangd would sometimes try to call a driver that didn't have obj-c support with ``-x objective-c++-header`` + * The driver path is now dot-normalized before being compared to the ``--query-driver`` pattern + * ``--query-driver`` is now supported by ``clangd-indexer`` +- Fixed a regression in clangd 17 where response files would not be expanded + Hover ^^^^^ +- Hover now shows alignment info for 
fields and records + Code completion ^^^^^^^^^^^^^^^ +- Refined heuristics for determining whether the use of a function can be a call or not + Code actions ^^^^^^^^^^^^ @@ -75,15 +89,25 @@ Code actions Signature help ^^^^^^^^^^^^^^ +- Improved support for calls through function pointer types + Cross-references ^^^^^^^^^^^^^^^^ +- Improved support for C++20 concepts +- Find-references now works for labels +- Improvements to template heuristics + Objective-C ^^^^^^^^^^^ Miscellaneous ^^^^^^^^^^^^^ +- Various stability improvements, e.g. crash fixes +- Improved error recovery on invalid code +- Clangd now bails gracefully on assembly and IR source files + Improvements to clang-doc ------------------------- @@ -564,10 +588,15 @@ Changes in existing checks Removed checks ^^^^^^^^^^^^^^ -Improvements to include-fixer +Improvements to include-cleaner ----------------------------- -The improvements are... +- Support for ``--only-headers`` flag to limit analysis to headers matching a regex +- Recognizes references through ``concept``s +- Builtin headers are not analyzed +- Handling of references through ``friend`` declarations +- Fixes around handling of IWYU pragmas on stdlib headers +- Improved handling around references to/from template specializations Improvements to clang-include-fixer ----------------------------------- diff --git a/clang/cmake/caches/Release.cmake b/clang/cmake/caches/Release.cmake index 1ca9138b980731..bd1f688d61a7ea 100644 --- a/clang/cmake/caches/Release.cmake +++ b/clang/cmake/caches/Release.cmake @@ -4,7 +4,7 @@ # General Options set(LLVM_RELEASE_ENABLE_LTO THIN CACHE STRING "") -set(LLVM_RELEASE_ENABLE_PGO ON CACHE BOOL "") +set(LLVM_RELEASE_ENABLE_PGO OFF CACHE BOOL "") set(CMAKE_BUILD_TYPE RELEASE CACHE STRING "") diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 4dc0de3a90f265..0b887288fe2cb1 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ 
-5277,15 +5277,9 @@ the configuration (without a prefix: ``Auto``). Possible values: * ``SBPO_Never`` (in configuration: ``Never``) - Never put a space before opening parentheses. - - .. code-block:: c++ - - void f() { - if(true) { - f(); - } - } + This is **deprecated** and replaced by ``Custom`` below, with all + ``SpaceBeforeParensOptions`` but ``AfterPlacementOperator`` set to + ``false``. * ``SBPO_ControlStatements`` (in configuration: ``ControlStatements``) Put a space before opening parentheses only after control statement @@ -5425,32 +5419,14 @@ the configuration (without a prefix: ``Auto``). void operator++ (int a); vs. void operator++(int a); object.operator++ (10); object.operator++(10); - * ``AfterPlacementOperatorStyle AfterPlacementOperator`` :versionbadge:`clang-format 18` - - Defines in which cases to put a space between ``new/delete`` operators - and opening parentheses. - - Possible values: - - * ``APO_Never`` (in configuration: ``Never``) - Remove space after ``new/delete`` operators and before ``(``. - - .. code-block:: c++ - - new(buf) T; - delete(buf) T; - - * ``APO_Always`` (in configuration: ``Always``) - Always add space after ``new/delete`` operators and before ``(``. + * ``bool AfterPlacementOperator`` If ``true``, put a space between operator ``new``/``delete`` and opening + parenthesis. - .. code-block:: c++ - - new (buf) T; - delete (buf) T; - - * ``APO_Leave`` (in configuration: ``Leave``) - Leave placement ``new/delete`` expressions as they are. + .. code-block:: c++ + true: false: + new (buf) T; vs. new(buf) T; + delete (buf) T; delete(buf) T; * ``bool AfterRequiresInClause`` If ``true``, put space between requires keyword in a requires clause and opening parentheses, if there is one. diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 060bc7669b72a5..e533ecfd5aeba5 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -58,12 +58,12 @@ code bases. 
To reduce such widespread breakages, as an extension, Clang accepts this code with an existing warning ``-Wambiguous-reversed-operator`` warning. - Fixes `GH `_. + Fixes `#53954 `_. - The CMake variable ``GCC_INSTALL_PREFIX`` (which sets the default ``--gcc-toolchain=``) is deprecated and will be removed. Specify ``--gcc-install-dir=`` or ``--gcc-triple=`` in a `configuration file - ` as a + `_ as a replacement. (`#77537 `_) @@ -95,7 +95,7 @@ C/C++ Language Potentially Breaking Changes - Fixed a bug in finding matching `operator!=` while adding reversed `operator==` as outlined in "The Equality Operator You Are Looking For" (`P2468 `_). - Fixes (`#68901: `_). + Fixes (`#68901 `_). C++ Specific Potentially Breaking Changes ----------------------------------------- @@ -139,16 +139,22 @@ C++ Specific Potentially Breaking Changes - Remove the hardcoded path to the imported modules for C++20 named modules. Now we require all the dependent modules to specified from the command line. - See (`#62707: `_). + See (`#62707 `_). - Forbid `import XXX;` in C++ to find module `XXX` comes from explicit clang modules. - See (`#64755: `_). + See (`#64755 `_). ABI Changes in This Version --------------------------- - Following the SystemV ABI for x86-64, ``__int128`` arguments will no longer be split between a register and a stack slot. +- Fixed Microsoft calling convention for returning certain classes with a + templated constructor. If a class has a templated constructor, it should + be returned indirectly even if it meets all the other requirements for + returning a class in a register. This affects some uses of std::pair. + (#GH86384). 
+ AST Dumping Potentially Breaking Changes ---------------------------------------- - When dumping a sugared type, Clang will no longer print the desugared type if @@ -171,6 +177,22 @@ AST Dumping Potentially Breaking Changes "qualType": "foo" } +Clang Frontend Potentially Breaking Changes +------------------------------------------- +- Target OS macros extension + A new Clang extension (see :ref:`here `) is enabled for + Darwin (Apple platform) targets. Clang now defines ``TARGET_OS_*`` macros for + these targets, which could break existing code bases with improper checks for + the ``TARGET_OS_`` macros. For example, existing checks might fail to include + the ``TargetConditionals.h`` header from Apple SDKs and therefore leave the + macros undefined and guarded code unexercised. + + Affected code should be checked to see if it's still intended for the specific + target and fixed accordingly. + + The extension can be turned off by the option ``-fno-define-target-os-macros`` + as a workaround. + What's New in Clang |release|? ============================== Some of the major new features and improvements to Clang are listed @@ -183,11 +205,17 @@ C++ Language Changes C++20 Feature Support ^^^^^^^^^^^^^^^^^^^^^ -- Implemented `P1907R1 ` which extends allowed non-type template argument +- Implemented `P1907R1 `_ which extends allowed non-type template argument kinds with e.g. floating point values and pointers and references to subobjects. This feature is still experimental. Accordingly, ``__cpp_nontype_template_args`` was not updated. However, its support can be tested with ``__has_extension(cxx_generalized_nttp)``. +- Clang won't perform ODR checks for decls in the global module fragment any + more, to ease the implementation and improve the user experience. + This follows MSVC's behavior. Users interested in testing the stricter + behavior can use the flag '-Xclang -fno-skip-odr-check-in-gmf'. + (`#79240 `_).
+ C++23 Feature Support ^^^^^^^^^^^^^^^^^^^^^ - Implemented `P0847R7: Deducing this `_. Some related core issues were also @@ -234,15 +262,10 @@ C++2c Feature Support Resolutions to C++ Defect Reports ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- Implemented `CWG2137 `_ which allows - list-initialization from objects of the same type. -- Implemented `CWG2311 `_: given a prvalue ``e`` of object type - ``T``, ``T{e}`` will try to resolve an initializer list constructor and will use it if successful (CWG2137). - Otherwise, if there is no initializer list constructor, the copy will be elided as if it was ``T(e)``. - Implemented `CWG2598 `_ and `CWG2096 `_, making unions (that have either no members or at least one literal member) literal types. - (`#77924: `_). + (`#77924 `_). C Language Changes @@ -303,6 +326,10 @@ Non-comprehensive list of changes in this release * The version of Unicode used by Clang (primarily to parse identifiers) has been updated to 15.1. +* Clang now defines macro ``__LLVM_INSTR_PROFILE_GENERATE`` when compiling with + PGO instrumentation profile generation, and ``__LLVM_INSTR_PROFILE_USE`` when + compiling with PGO profile use. + New Compiler Flags ------------------ @@ -343,6 +370,17 @@ New Compiler Flags attribute the replaceable global new and delete operators behave normally (like other functions) with respect to visibility attributes, pragmas and options (e.g ``--fvisibility=``). +* Full register names can be used when printing assembly via ``-mregnames``. + This option now matches the one used by GCC. + +.. _target_os_detail: + +* ``-fdefine-target-os-macros`` and its complement + ``-fno-define-target-os-macros``. Enables or disables the Clang extension to + provide built-in definitions of a list of ``TARGET_OS_*`` macros based on the + target triple. + + The extension is enabled by default for Darwin (Apple platform) targets. 
Deprecated Compiler Flags ------------------------- @@ -362,6 +400,7 @@ Modified Compiler Flags * ``-fvisibility-global-new-delete-hidden`` is now a deprecated spelling of ``-fvisibility-global-new-delete=force-hidden`` (``-fvisibility-global-new-delete=`` is new in this release). +* ``-fprofile-update`` is enabled for ``-fprofile-generate``. Removed Compiler Flags ------------------------- @@ -382,7 +421,7 @@ Attribute Changes in Clang types after default argument promotion. As a result, it's no longer an automatic diagnostic to use parameters of types that the format style supports but that are never the result of default argument promotion, such as - ``float``. (`#59824: `_) + ``float``. (`#59824 `_) - Clang now supports ``[[clang::preferred_type(type-name)]]`` as an attribute which can be applied to a bit-field. This attribute helps to map a bit-field @@ -450,13 +489,13 @@ Improvements to Clang's diagnostics - Clang's ``-Wtautological-negation-compare`` flag now diagnoses logical tautologies like ``x && !x`` and ``!x || x`` in expressions. This also makes ``-Winfinite-recursion`` diagnose more cases. - (`#56035: `_). + (`#56035 `_). - Clang constexpr evaluator now diagnoses compound assignment operators against uninitialized variables as a read of uninitialized object. (`#51536 `_) - Clang's ``-Wformat-truncation`` now diagnoses ``snprintf`` call that is known to result in string truncation. - (`#64871: `_). + (`#64871 `_). Existing warnings that similarly warn about the overflow in ``sprintf`` now falls under its own warning group ```-Wformat-overflow`` so that it can be disabled separately from ``Wfortify-source``. @@ -472,12 +511,12 @@ Improvements to Clang's diagnostics - Clang now emits ``-Wcast-qual`` for functional-style cast expressions. - Clang no longer emits irrelevant notes about unsatisfied constraint expressions on the left-hand side of ``||`` when the right-hand side constraint is satisfied. - (`#54678: `_). + (`#54678 `_). 
- Clang now prints its 'note' diagnostic in cyan instead of black, to be more compatible with terminals with dark background colors. This is also more consistent with GCC. - Clang now displays an improved diagnostic and a note when a defaulted special member is marked ``constexpr`` in a class with a virtual base class - (`#64843: `_). + (`#64843 `_). - ``-Wfixed-enum-extension`` and ``-Wmicrosoft-fixed-enum`` diagnostics are no longer emitted when building as C23, since C23 standardizes support for enums with a fixed underlying type. @@ -517,10 +556,10 @@ Improvements to Clang's diagnostics 1 | static_assert("A\n"[1] == U'🌍'); | ~~~~~~~~~^~~~~~~~ - Clang now always diagnoses when using non-standard layout types in ``offsetof`` . - (`#64619: `_) + (`#64619 `_) - Clang now diagnoses redefined defaulted constructor when redefined defaulted constructor with different exception specs. - (`#69094: `_) + (`#69094 `_) - Clang now diagnoses use of variable-length arrays in C++ by default (and under ``-Wall`` in GNU++ mode). This is an extension supported by Clang and GCC, but is very easy to accidentally use without realizing it's a @@ -596,7 +635,7 @@ Improvements to Clang's diagnostics - Clang now diagnoses definitions of friend function specializations, e.g. ``friend void f<>(int) {}``. - Clang now diagnoses narrowing conversions involving const references. - (`#63151: `_). + (`#63151 `_). - Clang now diagnoses unexpanded packs within the template argument lists of function template specializations. - The warning `-Wnan-infinity-disabled` is now emitted when ``INFINITY`` or ``NAN`` are used in arithmetic operations or function arguments in @@ -606,7 +645,7 @@ Improvements to Clang's diagnostics - Clang now diagnoses attempts to bind a bitfield to an NTTP of a reference type as erroneous converted constant expression and not as a reference to subobject. 
- Clang now diagnoses ``auto`` and ``decltype(auto)`` in declarations of conversion function template - (`CWG1878: `_) + (`CWG1878 `_) - Clang now diagnoses the requirement that non-template friend declarations with requires clauses and template friend declarations with a constraint that depends on a template parameter from an enclosing template must be a definition. @@ -637,15 +676,15 @@ Improvements to Clang's diagnostics - Clang now diagnoses import before module declarations but not in global module fragment. - (`#67627: `_). + (`#67627 `_). - Clang now diagnoses include headers with angle in module purviews, which is not usually intended. - (`#68615: `_) + (`#68615 `_) - Clang now won't mention invisible namespace when diagnose invisible declarations inside namespace. The original diagnostic message is confusing. - (`#73893: `_) + (`#73893 `_) Improvements to Clang's time-trace ---------------------------------- @@ -859,6 +898,16 @@ Bug Fixes in This Version Fixes (`#78290 `_) - Fixed assertion failure with deleted overloaded unary operators. Fixes (`#78314 `_) +- The XCOFF object file format does not support aliases to symbols having common + linkage. Clang now diagnoses the use of an alias for a common symbol when + compiling for AIX. + +- Clang now doesn't produce false-positive warning `-Wconstant-logical-operand` + for logical operators in C23. + Fixes (`#64356 `_). +- Clang's ``-Wshadow`` no longer warns when an init-capture is named the same as + a class field unless the lambda can capture this. + Fixes (`#71976 `_) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -923,7 +972,7 @@ Bug Fixes to C++ Support - Fix a crash when calling a non-constant immediate function in the initializer of a static data member. - (`#65985 _`). + (`#65985 `_). - Clang now properly converts static lambda call operator to function pointers on win32. 
(`#62594 `_) @@ -932,7 +981,7 @@ Bug Fixes to C++ Support of a function template or a member function of a class template was assigned the location of a non-defining declaration rather than the location of the definition the specialization was instantiated from. - (`#26057 `_`) + (`#26057 `_) - Fix a crash when a default member initializer of a base aggregate makes an invalid call to an immediate function. @@ -943,11 +992,11 @@ Bug Fixes to C++ Support (`#48527 `_) - Clang now no longer asserts when an UnresolvedLookupExpr is used as an - expression requirement. (`#66612 https://github.com/llvm/llvm-project/issues/66612`) + expression requirement. (`#66612 `_) - Clang now disambiguates NTTP types when printing diagnostics where the NTTP types are compared with the 'diff' method. - (`#66744 https://github.com/llvm/llvm-project/issues/66744`) + (`#66744 `_) - Fix crash caused by a spaceship operator returning a comparision category by reference. Fixes: @@ -1041,9 +1090,6 @@ Bug Fixes to C++ Support in different visibility. Fixes (`#67893 `_) -- Fix a false-positive ODR violation for different definitions for `std::align_val_t`. - Fixes (`#76638 `_) - - Remove recorded `#pragma once` state for headers included in named modules. Fixes (`#77995 `_) @@ -1054,6 +1100,25 @@ Bug Fixes to C++ Support Fixes (`#78830 `_) Fixes (`#60085 `_) + +- Fixed a bug where variables referenced by requires-clauses inside + nested generic lambdas were not properly injected into the constraint scope. + (`#73418 `_) + +- Fix incorrect code generation caused by the object argument of ``static operator()`` and ``static operator[]`` calls not being evaluated. + Fixes (`#67976 `_) + +- Fix crash when using an immediate-escalated function at global scope. + (`#82258 `_) +- Correctly immediate-escalate lambda conversion functions. + (`#82258 `_) +- Fix a crash when an unresolved overload set is encountered on the RHS of a ``.*`` operator. 
+ (`#53815 `_) + +- Fixed a regression in CTAD that a friend declaration that befriends itself may cause + incorrect constraint substitution. + (`#86769 `_) + Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ - Fixed an import failure of recursive friend class template. @@ -1164,6 +1229,17 @@ Arm and AArch64 Support * Cortex-A720 (cortex-a720). * Cortex-X4 (cortex-x4). +- Alpha support has been added for SVE2.1 intrinsics. + +- Support has been added for `-fstack-clash-protection` and `-mstack-probe-size` + command line options. + +- Function Multi Versioning has been extended to support Load-Acquire RCpc + instructions v3 (rcpc3) as well as Memory Copy and Memory Set Acceleration + instructions (mops) when targeting AArch64. The feature identifiers (in + parenthesis) can be used with either of the ``target_version`` and + ``target_clones`` attributes. + Android Support ^^^^^^^^^^^^^^^ @@ -1195,6 +1271,8 @@ Windows Support linking may succeed but the resulting executables may expose issues at runtime. +- Clang now passes relevant LTO options to the linker (LLD) in MinGW mode. + LoongArch Support ^^^^^^^^^^^^^^^^^ - Added builtins support for all LSX (128-bits SIMD) and LASX (256-bits SIMD) @@ -1227,6 +1305,9 @@ RISC-V Support - Default ABI with F but without D was changed to ilp32f for RV32 and to lp64f for RV64. +- ``__attribute__((rvv_vector_bits(N)))`` is now supported for RVV vbool*_t types. +- ``-mtls-dialect=desc`` is now supported to enable TLS descriptors (TLSDESC). + CUDA/HIP Language Changes ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1236,6 +1317,16 @@ CUDA Support - Clang now supports CUDA SDK up to 12.3 - Added support for sm_90a +PowerPC Support +^^^^^^^^^^^^^^^ + +- Added ``nmmintrin.h`` to intrinsics headers. +- Added ``__builtin_ppc_fence`` as barrier of code motion, and + ``__builtin_ppc_mffsl`` for corresponding instruction. +- Supported ``__attribute__((target("tune=cpu")))``. 
+- Emit ``float-abi`` module flag on 64-bit ELFv2 PowerPC targets if + ``long double`` type is used in current module. + AIX Support ^^^^^^^^^^^ @@ -1244,6 +1335,15 @@ AIX Support base is encoded as an immediate operand. This access sequence is not used for TLS variables larger than 32KB, and is currently only supported on 64-bit mode. +- Inline assembler supports VSR register in pure digits. +- Enabled ThinLTO support. Requires AIX 7.2 TL5 SP7 or newer, or AIX 7.3 TL2 + or newer. Similar to the LTO support on AIX, ThinLTO is implemented with + the libLTO.so plugin. + +SystemZ Support +^^^^^^^^^^^^^^^ +- Properly support 16 byte atomic int/fp types and ops. Atomic __int128 (and + long double) variables are now aligned to 16 bytes by default (like gcc 14). WebAssembly Support ^^^^^^^^^^^^^^^^^^^ @@ -1307,6 +1407,8 @@ libclang - Exposed arguments of ``clang::annotate``. - ``clang::getCursorKindForDecl`` now recognizes linkage specifications such as ``extern "C"`` and reports them as ``CXCursor_LinkageSpec``. +- Changed the libclang library on AIX to export only the necessary symbols to + prevent issues of resolving to the wrong duplicate symbol. Static Analyzer --------------- @@ -1318,9 +1420,6 @@ New features of static analysis tools, such as the Clang Static Analyzer. `Documentation `__. -- Added support for the ``cleanup`` attribute. - `Documentation `__. - - Support "Deducing this" (P0847R7). (Worked out of the box) (`af4751738db8 `__) @@ -1381,6 +1480,10 @@ Crash and bug fixes - Fix false positive in mutation check when using pointer to member function. (`#66204 `_) +- Fixed a crash in ``security.cert.env.InvalidPtr`` checker when accidentally + matched user-defined ``strerror`` and similar library functions. 
+ (`#88181 `_) + Improvements ^^^^^^^^^^^^ diff --git a/clang/docs/StandardCPlusPlusModules.rst b/clang/docs/StandardCPlusPlusModules.rst index 81043ff25be02e..0f85065f464a8f 100644 --- a/clang/docs/StandardCPlusPlusModules.rst +++ b/clang/docs/StandardCPlusPlusModules.rst @@ -457,6 +457,29 @@ Note that **currently** the compiler doesn't consider inconsistent macro definit Currently Clang would accept the above example. But it may produce surprising results if the debugging code depends on consistent use of ``NDEBUG`` also in other translation units. +Definitions consistency +^^^^^^^^^^^^^^^^^^^^^^^ + +The C++ language requires that declarations of the same entity in different translation +units have the same definition; this is known as the ODR (One Definition Rule). Prior to +modules, translation units don't depend on each other and the compiler itself can't perform +a strong ODR violation check. With the introduction of modules, the compiler now has the +chance to check for ODR violations with language semantics across translation units. + +However, in practice, we found the existing ODR checking mechanism is not stable +enough. Many people suffer from false positive ODR violation diagnostics, i.e., +the compiler incorrectly complains that two identical declarations have different +definitions. Meanwhile, true positive ODR violations are rarely reported. +We also learned that MSVC doesn't perform ODR checks for declarations in the global module +fragment. + +So, in order to provide a better user experience, save the time spent on ODR checking, and +keep behavior consistent with MSVC, we disabled the ODR check for declarations in the global +module fragment by default. Users who want stricter checks can still use the +``-Xclang -fno-skip-odr-check-in-gmf`` flag to get the ODR check enabled. Users are also +encouraged to report issues if they find false positive or false negative ODR +violations with the flag enabled.
+ ABI Impacts ----------- diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index 9a4736019d1b1b..eb7a1a32060077 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -673,6 +673,16 @@ class alignas(8) Decl { /// fragment. See [module.global.frag]p3,4 for details. bool isDiscardedInGlobalModuleFragment() const { return false; } + /// Check if we should skip checking ODRHash for declaration \param D. + /// + /// The existing ODRHash mechanism seems to be not stable enough and + /// the false positive ODR violation reports are annoying and we rarely see + /// true ODR violation reports. Also we learned that MSVC disabled ODR checks + /// for declarations in GMF. So we try to disable ODR checks in the GMF to + /// get better user experiences before we make the ODR violation checks stable + /// enough. + bool shouldSkipCheckingODR() const; + /// Return true if this declaration has an attribute which acts as /// definition of the entity, such as 'alias' or 'ifunc'. bool hasDefiningAttr() const; diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index ea425791fc97f0..6384cf9420b82e 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -3495,6 +3495,9 @@ enum class VectorKind { /// is RISC-V RVV fixed-length data vector RVVFixedLengthData, + + /// is RISC-V RVV fixed-length mask vector + RVVFixedLengthMask, }; /// Represents a GCC generic vector type. 
This type is created using diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 58838b01b4fd7c..dbf2dd2120fb69 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1590,6 +1590,7 @@ def RegCall : DeclOrTypeAttr { } def Final : InheritableAttr { + let CanPrintOnLeft = 0; let Spellings = [CustomKeyword<"final">, CustomKeyword<"sealed">]; let Accessors = [Accessor<"isSpelledAsSealed", [CustomKeyword<"sealed">]>]; let SemaHandler = 0; @@ -2472,6 +2473,7 @@ def Overloadable : Attr { } def Override : InheritableAttr { + let CanPrintOnLeft = 0; let Spellings = [CustomKeyword<"override">]; let SemaHandler = 0; // Omitted from docs, since this is language syntax, not an attribute, as far diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 7e633f8e2635a9..e02a1201e2ad79 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -2424,7 +2424,10 @@ only be a power of 2 between 64 and 65536. For types where LMUL!=1, ``__riscv_v_fixed_vlen`` needs to be scaled by the LMUL of the type before passing to the attribute. -``vbool*_t`` types are not supported at this time. +For ``vbool*_t`` types, ``__riscv_v_fixed_vlen`` needs to be divided by the +number from the type name. For example, ``vbool8_t`` needs to use +``__riscv_v_fixed_vlen`` / 8. If the resulting value is not a multiple of 8, +the type is not supported for that value of ``__riscv_v_fixed_vlen``. 
}]; } diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index d208342d9c516e..74dfd1d214e849 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -436,5 +436,67 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_i32, "ii*1", "nc", "gfx12-insts,w TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64") +//===----------------------------------------------------------------------===// +// WMMA builtins. +// Postfix w32 indicates the builtin requires wavefront size of 32. +// Postfix w64 indicates the builtin requires wavefront size of 64. +// +// Some of these are very similar to their GFX11 counterparts, but they don't +// require replication of the A,B matrices, so they use fewer vector elements. +// Therefore, we add an "_gfx12" suffix to distinguish them from the existing +// builtins. +//===----------------------------------------------------------------------===// +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12, "V8fV8hV8hV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12, "V8fV8sV8sV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12, "V8hV8hV8hV8h", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12, "V8sV8sV8sV8s", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12, "V8iIbiIbiV8iIb", "nc", "gfx12-insts,wavefrontsize32") +// These are gfx12-only, but for consistency with the other WMMA variants we're +// keeping the "_gfx12" suffix. 
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32") + +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12, "V4fV4hV4hV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12, "V4fV4sV4sV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12, "V4hV4hV4hV4h", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12, "V4sV4sV4sV4s", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64") +// These are gfx12-only, but for consistency with the other WMMA variants we're +// keeping the "_gfx12" suffix. 
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64") + +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32, "V8fV8hV16hV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32, "V8fV8sV16sV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32, "V8hV8hV16hV8hs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32, "V8sV8sV16sV8ss", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32, "V8iIbiIbV2iV8isIb", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32") + +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64, "V4fV4hV8hV4fs", 
"nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64, "V4fV4sV8sV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64, "V4hV4hV8hV4hs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64, "V4sV4sV8sV4ss", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64, "V4iIbiIbiV4isIb", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") + #undef BUILTIN #undef TARGET_BUILTIN diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index 2f2e45d5cf63df..7c0bfe32849614 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -369,6 +369,9 @@ ENUM_CODEGENOPT(VecLib, llvm::driver::VectorLibrary, 3, llvm::driver::VectorLibr /// The default TLS model to use. ENUM_CODEGENOPT(DefaultTLSModel, TLSModel, 2, GeneralDynamicTLSModel) +/// Whether to enable TLSDESC. AArch64 enables TLSDESC regardless of this value. +CODEGENOPT(EnableTLSDESC, 1, 0) + /// Bit size of immediate TLS offsets (0 == use the default). 
VALUE_CODEGENOPT(TLSSize, 8, 0) diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td index b1bada65cb6b28..08bb1d81ba29f1 100644 --- a/clang/include/clang/Basic/DiagnosticCommonKinds.td +++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td @@ -73,7 +73,7 @@ def warn_pragma_debug_unexpected_argument : Warning< def warn_fp_nan_inf_when_disabled : Warning< "use of %select{infinity|NaN}0%select{| via a macro}1 is undefined behavior " "due to the currently enabled floating-point options">, - InGroup>; + InGroup>; } // Parse && Sema diff --git a/clang/include/clang/Basic/DiagnosticDocs.td b/clang/include/clang/Basic/DiagnosticDocs.td index e9862422b4997e..8c024b5cad740a 100644 --- a/clang/include/clang/Basic/DiagnosticDocs.td +++ b/clang/include/clang/Basic/DiagnosticDocs.td @@ -87,3 +87,12 @@ program by treating all string literals as having type ``const char *`` instead of ``char *``. This can cause unexpected behaviors with type-sensitive constructs like ``_Generic``. }]; + +defvar NanInfDisabledDocs = [{ +This warning is enabled when source code using the macros ``INFINITY`` or ``NAN`` +is compiled with floating-point options preventing these two values. This can +lead to undefined behavior. Check the order of command line arguments that modify +this behavior, such as ``-ffast-math``, ``-fhonor-infinities``, and +``-fhonor-nans`` (etc), as well as ``#pragma`` directives if this diagnostic is +generated unexpectedly. 
+}]; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index a1c32abb4dcd88..ef8c111b1d8cc8 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3711,6 +3711,12 @@ def err_sme_za_call_no_za_state : Error< "call to a shared ZA function requires the caller to have ZA state">; def err_sme_zt0_call_no_zt0_state : Error< "call to a shared ZT0 function requires the caller to have ZT0 state">; +def err_sme_unimplemented_za_save_restore : Error< + "call to a function that shares state other than 'za' from a " + "function that has live 'za' state requires a spill/fill of ZA, which is not yet " + "implemented">; +def note_sme_use_preserves_za : Note< + "add '__arm_preserves(\"za\")' to the callee if it preserves ZA">; def err_sme_definition_using_sm_in_non_sme_target : Error< "function executed in streaming-SVE mode requires 'sme'">; def err_sme_definition_using_za_in_non_sme_target : Error< diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 8fc75e1cca0399..4942dcaa086eac 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -174,6 +174,7 @@ LANGOPT(MathErrno , 1, 1, "errno in math functions") BENIGN_LANGOPT(HeinousExtensions , 1, 0, "extensions that we really don't like and may be ripped out at any time") LANGOPT(Modules , 1, 0, "modules semantics") COMPATIBLE_LANGOPT(CPlusPlusModules, 1, 0, "C++ modules syntax") +LANGOPT(SkipODRCheckInGMF, 1, 0, "Skip ODR checks for decls in the global module fragment") LANGOPT(BuiltinHeadersInSystemModules, 1, 0, "builtin headers belong to system modules, and _Builtin_ modules are ignored for cstdlib headers") BENIGN_ENUM_LANGOPT(CompilingModule, CompilingModuleKind, 3, CMK_None, "compiling a module interface") diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 
695e1bddf9ffc6..2da0e8d2aba9a4 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -44,6 +44,7 @@ defm SVLD1_ZA32 : ZALoad<"za32", "i", "aarch64_sme_ld1w", [ImmCheck<0, ImmCheck0 defm SVLD1_ZA64 : ZALoad<"za64", "l", "aarch64_sme_ld1d", [ImmCheck<0, ImmCheck0_7>]>; defm SVLD1_ZA128 : ZALoad<"za128", "q", "aarch64_sme_ld1q", [ImmCheck<0, ImmCheck0_15>]>; +let TargetGuard = "sme" in { def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmQl", "", [IsOverloadNone, IsStreamingCompatible, IsInOutZA], MemEltTyDefault, "aarch64_sme_ldr">; @@ -51,6 +52,7 @@ def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmQl", "", def SVLDR_ZA : MInst<"svldr_za", "vmQ", "", [IsOverloadNone, IsStreamingCompatible, IsInOutZA], MemEltTyDefault, "aarch64_sme_ldr", []>; +} //////////////////////////////////////////////////////////////////////////////// // Stores @@ -81,6 +83,7 @@ defm SVST1_ZA32 : ZAStore<"za32", "i", "aarch64_sme_st1w", [ImmCheck<0, ImmCheck defm SVST1_ZA64 : ZAStore<"za64", "l", "aarch64_sme_st1d", [ImmCheck<0, ImmCheck0_7>]>; defm SVST1_ZA128 : ZAStore<"za128", "q", "aarch64_sme_st1q", [ImmCheck<0, ImmCheck0_15>]>; +let TargetGuard = "sme" in { def SVSTR_VNUM_ZA : MInst<"svstr_vnum_za", "vm%l", "", [IsOverloadNone, IsStreamingCompatible, IsInZA], MemEltTyDefault, "aarch64_sme_str">; @@ -88,6 +91,7 @@ def SVSTR_VNUM_ZA : MInst<"svstr_vnum_za", "vm%l", "", def SVSTR_ZA : MInst<"svstr_za", "vm%", "", [IsOverloadNone, IsStreamingCompatible, IsInZA], MemEltTyDefault, "aarch64_sme_str", []>; +} //////////////////////////////////////////////////////////////////////////////// // Read horizontal/vertical ZA slices @@ -277,22 +281,22 @@ multiclass ZAAddSub { def NAME # _ZA32_VG1x2_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x2", "vm2", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x2", [IsStreaming, IsInOutZA], []>; def NAME # _ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x4", "vm4", "iUif", MergeNone, "aarch64_sme_" # n_suffix # 
"_za32_vg1x4", [IsStreaming, IsInOutZA], []>; + } - let TargetGuard = "sme-i16i64" in { - def NAME # _WRITE_SINGLE_ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x2", "vm2d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsInOutZA], []>; - def NAME # _WRITE_SINGLE_ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x4", "vm4d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsInOutZA], []>; + let TargetGuard = "sme2,sme-i16i64" in { + def NAME # _WRITE_SINGLE_ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x2", "vm2d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _WRITE_SINGLE_ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x4", "vm4d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsInOutZA], []>; - def NAME # _WRITE_ZA64_VG1x2_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x2", "vm22", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsInOutZA], []>; - def NAME # _WRITE_ZA64_VG1x4_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x4", "vm44", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsInOutZA], []>; + def NAME # _WRITE_ZA64_VG1x2_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x2", "vm22", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _WRITE_ZA64_VG1x4_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x4", "vm44", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsInOutZA], []>; - def NAME # _ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsInOutZA], []>; - def NAME # _ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", 
"vm4", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsInOutZA], []>; - } + def NAME # _ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsInOutZA], []>; + } - let TargetGuard = "sme-f64f64" in { - def NAME # _ZA64_VG1X2_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsInOutZA], []>; - def NAME # _ZA64_VG1X4_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsInOutZA], []>; - } + let TargetGuard = "sme2,sme-f64f64" in { + def NAME # _ZA64_VG1X2_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _ZA64_VG1X4_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsInOutZA], []>; } } diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 7f4fa33748faca..175bedbfb4d01c 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2985,6 +2985,14 @@ def fmodule_output : Flag<["-"], "fmodule-output">, Flags<[NoXarchOption]>, Visibility<[ClangOption, CC1Option]>, HelpText<"Save intermediate module file results when compiling a standard C++ module unit.">; +defm skip_odr_check_in_gmf : BoolOption<"f", "skip-odr-check-in-gmf", + LangOpts<"SkipODRCheckInGMF">, DefaultFalse, + PosFlag, + NegFlag>, + Group; + def fmodules_prune_interval : Joined<["-"], "fmodules-prune-interval=">, Group, Visibility<[ClangOption, CC1Option]>, MetaVarName<"">, HelpText<"Specify the 
interval (in seconds) between attempts to prune the module cache">, @@ -4419,6 +4427,8 @@ def mtls_size_EQ : Joined<["-"], "mtls-size=">, Group, HelpText<"Specify bit size of immediate TLS offsets (AArch64 ELF only): " "12 (for 4KB) | 24 (for 16MB, default) | 32 (for 4GB) | 48 (for 256TB, needs -mcmodel=large)">, MarshallingInfoInt>; +def mtls_dialect_EQ : Joined<["-"], "mtls-dialect=">, Group, + Flags<[TargetSpecific]>, HelpText<"Which thread-local storage dialect to use for dynamic accesses of TLS variables">; def mimplicit_it_EQ : Joined<["-"], "mimplicit-it=">, Group; def mdefault_build_attributes : Joined<["-"], "mdefault-build-attributes">, Group; def mno_default_build_attributes : Joined<["-"], "mno-default-build-attributes">, Group; @@ -5805,6 +5815,18 @@ def mvis3 : Flag<["-"], "mvis3">, Group; def mno_vis3 : Flag<["-"], "mno-vis3">, Group; def mhard_quad_float : Flag<["-"], "mhard-quad-float">, Group; def msoft_quad_float : Flag<["-"], "msoft-quad-float">, Group; +foreach i = 1 ... 7 in + def ffixed_g#i : Flag<["-"], "ffixed-g"#i>, Group, + HelpText<"Reserve the G"#i#" register (SPARC only)">; +foreach i = 0 ... 5 in + def ffixed_o#i : Flag<["-"], "ffixed-o"#i>, Group, + HelpText<"Reserve the O"#i#" register (SPARC only)">; +foreach i = 0 ... 7 in + def ffixed_l#i : Flag<["-"], "ffixed-l"#i>, Group, + HelpText<"Reserve the L"#i#" register (SPARC only)">; +foreach i = 0 ... 
5 in + def ffixed_i#i : Flag<["-"], "ffixed-i"#i>, Group, + HelpText<"Reserve the I"#i#" register (SPARC only)">; } // let Flags = [TargetSpecific] // M68k features flags @@ -7066,6 +7088,9 @@ def fexperimental_assignment_tracking_EQ : Joined<["-"], "fexperimental-assignme Values<"disabled,enabled,forced">, NormalizedValues<["Disabled","Enabled","Forced"]>, MarshallingInfoEnum, "Enabled">; +def enable_tlsdesc : Flag<["-"], "enable-tlsdesc">, + MarshallingInfoFlag>; + } // let Visibility = [CC1Option] //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index bc9eecd42f9ebf..e61619e90317ee 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -4157,14 +4157,9 @@ struct FormatStyle { /// Different ways to put a space before opening parentheses. enum SpaceBeforeParensStyle : int8_t { - /// Never put a space before opening parentheses. - /// \code - /// void f() { - /// if(true) { - /// f(); - /// } - /// } - /// \endcode + /// This is **deprecated** and replaced by ``Custom`` below, with all + /// ``SpaceBeforeParensOptions`` but ``AfterPlacementOperator`` set to + /// ``false``. SBPO_Never, /// Put a space before opening parentheses only after control statement /// keywords (``for/if/while...``). @@ -4273,28 +4268,14 @@ struct FormatStyle { /// object.operator++ (10); object.operator++(10); /// \endcode bool AfterOverloadedOperator; - /// Styles for adding spacing between ``new/delete`` operators and opening - /// parentheses. - enum AfterPlacementOperatorStyle : int8_t { - /// Remove space after ``new/delete`` operators and before ``(``. - /// \code - /// new(buf) T; - /// delete(buf) T; - /// \endcode - APO_Never, - /// Always add space after ``new/delete`` operators and before ``(``. 
- /// \code - /// new (buf) T; - /// delete (buf) T; - /// \endcode - APO_Always, - /// Leave placement ``new/delete`` expressions as they are. - APO_Leave, - }; - /// Defines in which cases to put a space between ``new/delete`` operators - /// and opening parentheses. - /// \version 18 - AfterPlacementOperatorStyle AfterPlacementOperator; + /// If ``true``, put a space between operator ``new``/``delete`` and opening + /// parenthesis. + /// \code + /// true: false: + /// new (buf) T; vs. new(buf) T; + /// delete (buf) T; delete(buf) T; + /// \endcode + bool AfterPlacementOperator; /// If ``true``, put space between requires keyword in a requires clause and /// opening parentheses, if there is one. /// \code @@ -4327,7 +4308,7 @@ struct FormatStyle { : AfterControlStatements(false), AfterForeachMacros(false), AfterFunctionDeclarationName(false), AfterFunctionDefinitionName(false), AfterIfMacros(false), - AfterOverloadedOperator(false), AfterPlacementOperator(APO_Leave), + AfterOverloadedOperator(false), AfterPlacementOperator(true), AfterRequiresInClause(false), AfterRequiresInExpression(false), BeforeNonEmptyParentheses(false) {} @@ -5212,7 +5193,8 @@ llvm::Expected getStyle(StringRef StyleName, StringRef FileName, StringRef FallbackStyle, StringRef Code = "", llvm::vfs::FileSystem *FS = nullptr, - bool AllowUnknownOptions = false); + bool AllowUnknownOptions = false, + llvm::SourceMgr::DiagHandlerTy DiagHandler = nullptr); // Guesses the language from the ``FileName`` and ``Code`` to be formatted. // Defaults to FormatStyle::LK_Cpp. 
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 2d9c53cdf5bde8..b0a8ec0fec5e94 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -2828,7 +2828,8 @@ class Preprocessor { return AnnotationInfos.find(II)->second; } - void emitMacroExpansionWarnings(const Token &Identifier) const { + void emitMacroExpansionWarnings(const Token &Identifier, + bool IsIfnDef = false) const { IdentifierInfo *Info = Identifier.getIdentifierInfo(); if (Info->isDeprecatedMacro()) emitMacroDeprecationWarning(Identifier); @@ -2837,12 +2838,12 @@ class Preprocessor { !SourceMgr.isInMainFile(Identifier.getLocation())) emitRestrictExpansionWarning(Identifier); - if (Info->getName() == "INFINITY") - if (getLangOpts().NoHonorInfs) + if (!IsIfnDef) { + if (Info->getName() == "INFINITY" && getLangOpts().NoHonorInfs) emitRestrictInfNaNWarning(Identifier, 0); - if (Info->getName() == "NAN") - if (getLangOpts().NoHonorNaNs) + if (Info->getName() == "NAN" && getLangOpts().NoHonorNaNs) emitRestrictInfNaNWarning(Identifier, 1); + } } static void processPathForFileMacro(SmallVectorImpl &Path, diff --git a/clang/include/clang/Sema/Lookup.h b/clang/include/clang/Sema/Lookup.h index 9c93bf1e6fb428..2f2f2607a937fe 100644 --- a/clang/include/clang/Sema/Lookup.h +++ b/clang/include/clang/Sema/Lookup.h @@ -754,7 +754,8 @@ class LookupResult { private: void diagnoseAccess() { - if (isClassLookup() && getSema().getLangOpts().AccessControl) + if (!isAmbiguous() && isClassLookup() && + getSema().getLangOpts().AccessControl) getSema().CheckLookupAccess(*this); } diff --git a/clang/include/clang/Sema/ScopeInfo.h b/clang/include/clang/Sema/ScopeInfo.h index 6eaa74382685ba..06e47eed4e93b6 100644 --- a/clang/include/clang/Sema/ScopeInfo.h +++ b/clang/include/clang/Sema/ScopeInfo.h @@ -925,8 +925,8 @@ class LambdaScopeInfo final : /// that were defined in parent contexts. 
Used to avoid warnings when the /// shadowed variables are uncaptured by this lambda. struct ShadowedOuterDecl { - const VarDecl *VD; - const VarDecl *ShadowedDecl; + const NamedDecl *VD; + const NamedDecl *ShadowedDecl; }; llvm::SmallVector ShadowingDecls; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 1f1cbd11ff7358..6adb8fb7966b3f 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -1090,7 +1090,9 @@ class Sema final { if (FD) { FD->setWillHaveBody(true); S.ExprEvalContexts.back().InImmediateFunctionContext = - FD->isImmediateFunction(); + FD->isImmediateFunction() || + S.ExprEvalContexts[S.ExprEvalContexts.size() - 2] + .isConstantEvaluated(); S.ExprEvalContexts.back().InImmediateEscalatingFunctionContext = S.getLangOpts().CPlusPlus20 && FD->isImmediateEscalating(); } else diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index dd1451bbf2d2c9..62c25f5b7a0df8 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -2451,7 +2451,6 @@ class BitsUnpacker { uint32_t Value; uint32_t CurrentBitsIndex = ~0; }; - } // namespace clang #endif // LLVM_CLANG_SERIALIZATION_ASTREADER_H diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 5eb7aa3664569d..cc5de9a6295ebf 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -498,7 +498,11 @@ void ASTContext::attachCommentsToJustParsedDecls(ArrayRef Decls, return; FileID File; - for (Decl *D : Decls) { + for (const Decl *D : Decls) { + if (D->isInvalidDecl()) + continue; + + D = &adjustDeclToTemplate(*D); SourceLocation Loc = D->getLocation(); if (Loc.isValid()) { // See if there are any new comments that are not attached to a decl. 
@@ -1945,7 +1949,8 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const { else if (VT->getVectorKind() == VectorKind::SveFixedLengthPredicate) // Adjust the alignment for fixed-length SVE predicates. Align = 16; - else if (VT->getVectorKind() == VectorKind::RVVFixedLengthData) + else if (VT->getVectorKind() == VectorKind::RVVFixedLengthData || + VT->getVectorKind() == VectorKind::RVVFixedLengthMask) // Adjust the alignment for fixed-length RVV vectors. Align = std::min(64, Width); break; @@ -9416,7 +9421,9 @@ bool ASTContext::areCompatibleVectorTypes(QualType FirstVec, Second->getVectorKind() != VectorKind::SveFixedLengthData && Second->getVectorKind() != VectorKind::SveFixedLengthPredicate && First->getVectorKind() != VectorKind::RVVFixedLengthData && - Second->getVectorKind() != VectorKind::RVVFixedLengthData) + Second->getVectorKind() != VectorKind::RVVFixedLengthData && + First->getVectorKind() != VectorKind::RVVFixedLengthMask && + Second->getVectorKind() != VectorKind::RVVFixedLengthMask) return true; return false; @@ -9522,8 +9529,11 @@ static uint64_t getRVVTypeSize(ASTContext &Context, const BuiltinType *Ty) { ASTContext::BuiltinVectorTypeInfo Info = Context.getBuiltinVectorTypeInfo(Ty); - uint64_t EltSize = Context.getTypeSize(Info.ElementType); - uint64_t MinElts = Info.EC.getKnownMinValue(); + unsigned EltSize = Context.getTypeSize(Info.ElementType); + if (Info.ElementType == Context.BoolTy) + EltSize = 1; + + unsigned MinElts = Info.EC.getKnownMinValue(); return VScale->first * MinElts * EltSize; } @@ -9537,6 +9547,12 @@ bool ASTContext::areCompatibleRVVTypes(QualType FirstType, auto IsValidCast = [this](QualType FirstType, QualType SecondType) { if (const auto *BT = FirstType->getAs()) { if (const auto *VT = SecondType->getAs()) { + if (VT->getVectorKind() == VectorKind::RVVFixedLengthMask) { + BuiltinVectorTypeInfo Info = getBuiltinVectorTypeInfo(BT); + return FirstType->isRVVVLSBuiltinType() && + Info.ElementType == BoolTy && + 
getTypeSize(SecondType) == getRVVTypeSize(*this, BT); + } if (VT->getVectorKind() == VectorKind::RVVFixedLengthData || VT->getVectorKind() == VectorKind::Generic) return FirstType->isRVVVLSBuiltinType() && diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 26fdfa040796ed..1ee33fd7576d7d 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -4476,7 +4476,7 @@ unsigned FunctionDecl::getODRHash() { } class ODRHash Hash; - Hash.AddFunctionDecl(this); + Hash.AddFunctionDecl(this, /*SkipBody=*/shouldSkipCheckingODR()); setHasODRHash(true); ODRHash = Hash.CalculateHash(); return ODRHash; diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp index 8163f9bdaf8d97..6b3c13ff206d23 100644 --- a/clang/lib/AST/DeclBase.cpp +++ b/clang/lib/AST/DeclBase.cpp @@ -1102,6 +1102,11 @@ bool Decl::isInAnotherModuleUnit() const { return M != getASTContext().getCurrentNamedModule(); } +bool Decl::shouldSkipCheckingODR() const { + return getASTContext().getLangOpts().SkipODRCheckInGMF && getOwningModule() && + getOwningModule()->isExplicitGlobalModule(); +} + static Decl::Kind getKind(const Decl *D) { return D->getKind(); } static Decl::Kind getKind(const DeclContext *DC) { return DC->getDeclKind(); } diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index f1d07d022b2584..edf9b5e2d52bb3 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -7951,7 +7951,8 @@ class ExprEvaluatorBase // Overloaded operator calls to member functions are represented as normal // calls with '*this' as the first argument. const CXXMethodDecl *MD = dyn_cast(FD); - if (MD && MD->isImplicitObjectMemberFunction()) { + if (MD && + (MD->isImplicitObjectMemberFunction() || (OCE && MD->isStatic()))) { // FIXME: When selecting an implicit conversion for an overloaded // operator delete, we sometimes try to evaluate calls to conversion // operators without a 'this' parameter! 
@@ -7960,7 +7961,11 @@ class ExprEvaluatorBase if (!EvaluateObjectArgument(Info, Args[0], ThisVal)) return false; - This = &ThisVal; + + // If we are calling a static operator, the 'this' argument needs to be + // ignored after being evaluated. + if (MD->isInstance()) + This = &ThisVal; // If this is syntactically a simple assignment using a trivial // assignment operator, start the lifetimes of union members as needed, diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 40b1e086ddd0c6..688141b30441e8 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -3994,7 +3994,8 @@ void CXXNameMangler::mangleAArch64FixedSveVectorType( } void CXXNameMangler::mangleRISCVFixedRVVVectorType(const VectorType *T) { - assert(T->getVectorKind() == VectorKind::RVVFixedLengthData && + assert((T->getVectorKind() == VectorKind::RVVFixedLengthData || + T->getVectorKind() == VectorKind::RVVFixedLengthMask) && "expected fixed-length RVV vector!"); QualType EltType = T->getElementType(); @@ -4009,7 +4010,10 @@ void CXXNameMangler::mangleRISCVFixedRVVVectorType(const VectorType *T) { TypeNameOS << "int8"; break; case BuiltinType::UChar: - TypeNameOS << "uint8"; + if (T->getVectorKind() == VectorKind::RVVFixedLengthData) + TypeNameOS << "uint8"; + else + TypeNameOS << "bool"; break; case BuiltinType::Short: TypeNameOS << "int16"; @@ -4048,12 +4052,16 @@ void CXXNameMangler::mangleRISCVFixedRVVVectorType(const VectorType *T) { auto VScale = getASTContext().getTargetInfo().getVScaleRange( getASTContext().getLangOpts()); unsigned VLen = VScale->first * llvm::RISCV::RVVBitsPerBlock; - TypeNameOS << 'm'; - if (VecSizeInBits >= VLen) - TypeNameOS << (VecSizeInBits / VLen); - else - TypeNameOS << 'f' << (VLen / VecSizeInBits); + if (T->getVectorKind() == VectorKind::RVVFixedLengthData) { + TypeNameOS << 'm'; + if (VecSizeInBits >= VLen) + TypeNameOS << (VecSizeInBits / VLen); + else + TypeNameOS << 'f' << (VLen / VecSizeInBits); + } 
else { + TypeNameOS << (VLen / VecSizeInBits); + } TypeNameOS << "_t"; Out << "9__RVV_VLSI" << 'u' << TypeNameStr.size() << TypeNameStr << "Lj" @@ -4093,7 +4101,8 @@ void CXXNameMangler::mangleType(const VectorType *T) { T->getVectorKind() == VectorKind::SveFixedLengthPredicate) { mangleAArch64FixedSveVectorType(T); return; - } else if (T->getVectorKind() == VectorKind::RVVFixedLengthData) { + } else if (T->getVectorKind() == VectorKind::RVVFixedLengthData || + T->getVectorKind() == VectorKind::RVVFixedLengthMask) { mangleRISCVFixedRVVVectorType(T); return; } diff --git a/clang/lib/AST/JSONNodeDumper.cpp b/clang/lib/AST/JSONNodeDumper.cpp index 3daba13d0fce7b..3c11b75d7472d9 100644 --- a/clang/lib/AST/JSONNodeDumper.cpp +++ b/clang/lib/AST/JSONNodeDumper.cpp @@ -703,6 +703,9 @@ void JSONNodeDumper::VisitVectorType(const VectorType *VT) { case VectorKind::RVVFixedLengthData: JOS.attribute("vectorKind", "fixed-length rvv data vector"); break; + case VectorKind::RVVFixedLengthMask: + JOS.attribute("vectorKind", "fixed-length rvv mask vector"); + break; } } diff --git a/clang/lib/AST/ODRHash.cpp b/clang/lib/AST/ODRHash.cpp index 5b98646a1e8dc0..2dbc259138a897 100644 --- a/clang/lib/AST/ODRHash.cpp +++ b/clang/lib/AST/ODRHash.cpp @@ -745,55 +745,8 @@ void ODRHash::AddEnumDecl(const EnumDecl *Enum) { if (Enum->isScoped()) AddBoolean(Enum->isScopedUsingClassTag()); - if (Enum->getIntegerTypeSourceInfo()) { - // FIMXE: This allows two enums with different spellings to have the same - // hash. 
- // - // // mod1.cppm - // module; - // extern "C" { - // typedef unsigned __int64 size_t; - // } - // namespace std { - // using :: size_t; - // } - // - // extern "C++" { - // namespace std { - // enum class align_val_t : std::size_t {}; - // } - // } - // - // export module mod1; - // export using std::align_val_t; - // - // // mod2.cppm - // module; - // extern "C" { - // typedef unsigned __int64 size_t; - // } - // - // extern "C++" { - // namespace std { - // enum class align_val_t : size_t {}; - // } - // } - // - // export module mod2; - // import mod1; - // export using std::align_val_t; - // - // The above example should be disallowed since it violates - // [basic.def.odr]p14: - // - // Each such definition shall consist of the same sequence of tokens - // - // The definitions of `std::align_val_t` in two module units have different - // spellings but we failed to give an error here. - // - // See https://github.com/llvm/llvm-project/issues/76638 for details. + if (Enum->getIntegerTypeSourceInfo()) AddQualType(Enum->getIntegerType().getCanonicalType()); - } // Filter out sub-Decls which will not be processed in order to get an // accurate count of Decl's. 
diff --git a/clang/lib/AST/TemplateBase.cpp b/clang/lib/AST/TemplateBase.cpp index 2bdbeb08ef2046..3310d7dc24c59d 100644 --- a/clang/lib/AST/TemplateBase.cpp +++ b/clang/lib/AST/TemplateBase.cpp @@ -450,7 +450,8 @@ bool TemplateArgument::structurallyEquals(const TemplateArgument &Other) const { getAsIntegral() == Other.getAsIntegral(); case StructuralValue: { - if (getStructuralValueType() != Other.getStructuralValueType()) + if (getStructuralValueType().getCanonicalType() != + Other.getStructuralValueType().getCanonicalType()) return false; llvm::FoldingSetNodeID A, B; diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 48c6729a673819..ecf5de0be543d7 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -1623,6 +1623,9 @@ void TextNodeDumper::VisitVectorType(const VectorType *T) { case VectorKind::RVVFixedLengthData: OS << " fixed-length rvv data vector"; break; + case VectorKind::RVVFixedLengthMask: + OS << " fixed-length rvv mask vector"; + break; } OS << " " << T->getNumElements(); } diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 3db5ae182f32c4..d4103025591e73 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -2479,6 +2479,9 @@ bool Type::isRVVVLSBuiltinType() const { IsFP, IsBF) \ case BuiltinType::Id: \ return NF == 1; +#define RVV_PREDICATE_TYPE(Name, Id, SingletonId, NumEls) \ + case BuiltinType::Id: \ + return true; #include "clang/Basic/RISCVVTypes.def" default: return false; @@ -2491,7 +2494,17 @@ QualType Type::getRVVEltType(const ASTContext &Ctx) const { assert(isRVVVLSBuiltinType() && "unsupported type!"); const BuiltinType *BTy = castAs(); - return Ctx.getBuiltinVectorTypeInfo(BTy).ElementType; + + switch (BTy->getKind()) { +#define RVV_PREDICATE_TYPE(Name, Id, SingletonId, NumEls) \ + case BuiltinType::Id: \ + return Ctx.UnsignedCharTy; + default: + return Ctx.getBuiltinVectorTypeInfo(BTy).ElementType; +#include "clang/Basic/RISCVVTypes.def" + 
} + + llvm_unreachable("Unhandled type"); } bool QualType::isPODType(const ASTContext &Context) const { diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index 80b42c8f84a00a..e9b6e810b02e8d 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -694,6 +694,7 @@ void TypePrinter::printVectorBefore(const VectorType *T, raw_ostream &OS) { printBefore(T->getElementType(), OS); break; case VectorKind::RVVFixedLengthData: + case VectorKind::RVVFixedLengthMask: // FIXME: We prefer to print the size directly here, but have no way // to get the size of the type. OS << "__attribute__((__riscv_rvv_vector_bits__("; @@ -773,6 +774,7 @@ void TypePrinter::printDependentVectorBefore( printBefore(T->getElementType(), OS); break; case VectorKind::RVVFixedLengthData: + case VectorKind::RVVFixedLengthMask: // FIXME: We prefer to print the size directly here, but have no way // to get the size of the type. OS << "__attribute__((__riscv_rvv_vector_bits__("; diff --git a/clang/lib/Basic/Module.cpp b/clang/lib/Basic/Module.cpp index 925217431d4d02..0dac8748a98aff 100644 --- a/clang/lib/Basic/Module.cpp +++ b/clang/lib/Basic/Module.cpp @@ -301,10 +301,9 @@ bool Module::directlyUses(const Module *Requested) { if (Requested->isSubModuleOf(Use)) return true; - // Anyone is allowed to use our builtin stdarg.h and stddef.h and their - // accompanying modules. - if (Requested->getTopLevelModuleName() == "_Builtin_stdarg" || - Requested->getTopLevelModuleName() == "_Builtin_stddef") + // Anyone is allowed to use our builtin stddef.h and its accompanying modules. 
+ if (Requested->fullModuleNameIs({"_Builtin_stddef", "max_align_t"}) || + Requested->fullModuleNameIs({"_Builtin_stddef_wint_t"})) return true; if (NoUndeclaredIncludes) diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index d47181bfca4fc8..f5a5d689fa095c 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -258,7 +258,6 @@ void AArch64TargetInfo::getTargetDefinesARMV83A(const LangOptions &Opts, MacroBuilder &Builder) const { Builder.defineMacro("__ARM_FEATURE_COMPLEX", "1"); Builder.defineMacro("__ARM_FEATURE_JCVT", "1"); - Builder.defineMacro("__ARM_FEATURE_PAUTH", "1"); // Also include the Armv8.2 defines getTargetDefinesARMV82A(Opts, Builder); } @@ -387,6 +386,11 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__ARM_ALIGN_MAX_STACK_PWR", "4"); + // These macros are set when Clang can parse declarations with these + // attributes. + Builder.defineMacro("__ARM_STATE_ZA", "1"); + Builder.defineMacro("__ARM_STATE_ZT0", "1"); + // 0xe implies support for half, single and double precision operations. 
if (FPU & FPUMode) Builder.defineMacro("__ARM_FP", "0xE"); @@ -431,6 +435,17 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts, if (HasSVE2 && HasSVE2SM4) Builder.defineMacro("__ARM_FEATURE_SVE2_SM4", "1"); + if (HasSME) { + Builder.defineMacro("__ARM_FEATURE_SME"); + Builder.defineMacro("__ARM_FEATURE_LOCALLY_STREAMING", "1"); + } + + if (HasSME2) { + Builder.defineMacro("__ARM_FEATURE_SME"); + Builder.defineMacro("__ARM_FEATURE_SME2"); + Builder.defineMacro("__ARM_FEATURE_LOCALLY_STREAMING", "1"); + } + if (HasCRC) Builder.defineMacro("__ARM_FEATURE_CRC32", "1"); @@ -686,6 +701,7 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const { .Case("sve2-sha3", FPU & SveMode && HasSVE2SHA3) .Case("sve2-sm4", FPU & SveMode && HasSVE2SM4) .Case("sme", HasSME) + .Case("sme2", HasSME2) .Case("sme-f64f64", HasSMEF64F64) .Case("sme-i16i64", HasSMEI16I64) .Case("sme-fa64", HasSMEFA64) @@ -806,6 +822,12 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector &Features, HasBFloat16 = true; HasFullFP16 = true; } + if (Feature == "+sme2") { + HasSME = true; + HasSME2 = true; + HasBFloat16 = true; + HasFullFP16 = true; + } if (Feature == "+sme-f64f64") { HasSME = true; HasSMEF64F64 = true; @@ -1164,6 +1186,8 @@ TargetInfo::BuiltinVaListKind AArch64TargetInfo::getBuiltinVaListKind() const { } const char *const AArch64TargetInfo::GCCRegNames[] = { + // clang-format off + // 32-bit Integer registers "w0", "w1", "w2", "w3", "w4", "w5", "w6", "w7", "w8", "w9", "w10", "w11", "w12", "w13", "w14", "w15", "w16", "w17", "w18", "w19", "w20", "w21", "w22", @@ -1200,7 +1224,12 @@ const char *const AArch64TargetInfo::GCCRegNames[] = { // SVE predicate-as-counter registers "pn0", "pn1", "pn2", "pn3", "pn4", "pn5", "pn6", "pn7", "pn8", - "pn9", "pn10", "pn11", "pn12", "pn13", "pn14", "pn15" + "pn9", "pn10", "pn11", "pn12", "pn13", "pn14", "pn15", + + // SME registers + "za", "zt0", + + // clang-format on }; ArrayRef AArch64TargetInfo::getGCCRegNames() const { diff 
--git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index f0e0782e7abe97..9699222b0bf773 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -68,6 +68,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { bool HasCCDP = false; bool HasFRInt3264 = false; bool HasSME = false; + bool HasSME2 = false; bool HasSMEF64F64 = false; bool HasSMEI16I64 = false; bool HasSB = false; diff --git a/clang/lib/Basic/Targets/Mips.h b/clang/lib/Basic/Targets/Mips.h index f46b95abfd75c7..23d4e1b598fa1e 100644 --- a/clang/lib/Basic/Targets/Mips.h +++ b/clang/lib/Basic/Targets/Mips.h @@ -237,12 +237,14 @@ class LLVM_LIBRARY_VISIBILITY MipsTargetInfo : public TargetInfo { case 'r': // CPU registers. case 'd': // Equivalent to "r" unless generating MIPS16 code. case 'y': // Equivalent to "r", backward compatibility only. - case 'f': // floating-point registers. case 'c': // $25 for indirect jumps case 'l': // lo register case 'x': // hilo register pair Info.setAllowsRegister(); return true; + case 'f': // floating-point registers. + Info.setAllowsRegister(); + return FloatABI != SoftFloat; case 'I': // Signed 16-bit constant case 'J': // Integer 0 case 'K': // Unsigned 16-bit constant diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index ec203f6f28bc17..4f22d35f9d3a94 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -186,6 +186,14 @@ class EmitAssemblyHelper { TargetTriple.getVendor() != llvm::Triple::Apple; } + /// Check whether we should emit a flag for UnifiedLTO. + /// The UnifiedLTO module flag should be set when UnifiedLTO is enabled for + /// ThinLTO or Full LTO with module summaries. 
+ bool shouldEmitUnifiedLTOModueFlag() const { + return CodeGenOpts.UnifiedLTO && + (CodeGenOpts.PrepareForThinLTO || shouldEmitRegularLTOSummary()); + } + public: EmitAssemblyHelper(DiagnosticsEngine &_Diags, const HeaderSearchOptions &HeaderSearchOpts, @@ -401,6 +409,7 @@ static bool initTargetOptions(DiagnosticsEngine &Diags, Options.UniqueBasicBlockSectionNames = CodeGenOpts.UniqueBasicBlockSectionNames; Options.TLSSize = CodeGenOpts.TLSSize; + Options.EnableTLSDESC = CodeGenOpts.EnableTLSDESC; Options.EmulatedTLS = CodeGenOpts.EmulatedTLS; Options.DebuggerTuning = CodeGenOpts.getDebuggerTuning(); Options.EmitStackSizeSection = CodeGenOpts.StackSizeSection; @@ -1028,7 +1037,8 @@ void EmitAssemblyHelper::RunOptimizationPipeline( if (!actionRequiresCodeGen(Action) && CodeGenOpts.VerifyModule) MPM.addPass(VerifierPass()); - if (Action == Backend_EmitBC || Action == Backend_EmitLL) { + if (Action == Backend_EmitBC || Action == Backend_EmitLL || + CodeGenOpts.FatLTO) { if (CodeGenOpts.PrepareForThinLTO && !CodeGenOpts.DisableLLVMPasses) { if (!TheModule->getModuleFlag("EnableSplitLTOUnit")) TheModule->addModuleFlag(llvm::Module::Error, "EnableSplitLTOUnit", @@ -1039,11 +1049,9 @@ void EmitAssemblyHelper::RunOptimizationPipeline( if (!ThinLinkOS) return; } - if (CodeGenOpts.UnifiedLTO) - TheModule->addModuleFlag(llvm::Module::Error, "UnifiedLTO", uint32_t(1)); MPM.addPass(ThinLTOBitcodeWriterPass( *OS, ThinLinkOS ? 
&ThinLinkOS->os() : nullptr)); - } else { + } else if (Action == Backend_EmitLL) { MPM.addPass(PrintModulePass(*OS, "", CodeGenOpts.EmitLLVMUseLists, /*EmitLTOSummary=*/true)); } @@ -1057,24 +1065,17 @@ void EmitAssemblyHelper::RunOptimizationPipeline( if (!TheModule->getModuleFlag("EnableSplitLTOUnit")) TheModule->addModuleFlag(llvm::Module::Error, "EnableSplitLTOUnit", uint32_t(1)); - if (CodeGenOpts.UnifiedLTO) - TheModule->addModuleFlag(llvm::Module::Error, "UnifiedLTO", uint32_t(1)); } - if (Action == Backend_EmitBC) + if (Action == Backend_EmitBC) { MPM.addPass(BitcodeWriterPass(*OS, CodeGenOpts.EmitLLVMUseLists, EmitLTOSummary)); - else + } else if (Action == Backend_EmitLL) { MPM.addPass(PrintModulePass(*OS, "", CodeGenOpts.EmitLLVMUseLists, EmitLTOSummary)); + } } - } - if (CodeGenOpts.FatLTO) { - // Set the EnableSplitLTOUnit and UnifiedLTO module flags, since FatLTO - // uses a different action than Backend_EmitBC or Backend_EmitLL. - if (!TheModule->getModuleFlag("EnableSplitLTOUnit")) - TheModule->addModuleFlag(llvm::Module::Error, "EnableSplitLTOUnit", - uint32_t(CodeGenOpts.EnableSplitLTOUnit)); - if (CodeGenOpts.UnifiedLTO && !TheModule->getModuleFlag("UnifiedLTO")) + + if (shouldEmitUnifiedLTOModueFlag()) TheModule->addModuleFlag(llvm::Module::Error, "UnifiedLTO", uint32_t(1)); } diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 7ef764b8e1ac80..44ddd2428b10f5 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -823,29 +823,32 @@ const FieldDecl *CodeGenFunction::FindFlexibleArrayMemberField( ASTContext &Ctx, const RecordDecl *RD, StringRef Name, uint64_t &Offset) { const LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel = getLangOpts().getStrictFlexArraysLevel(); - unsigned FieldNo = 0; - bool IsUnion = RD->isUnion(); + uint32_t FieldNo = 0; - for (const Decl *D : RD->decls()) { - if (const auto *Field = dyn_cast(D); - Field && (Name.empty() || Field->getNameAsString() 
== Name) && + if (RD->isImplicit()) + return nullptr; + + for (const FieldDecl *FD : RD->fields()) { + if ((Name.empty() || FD->getNameAsString() == Name) && Decl::isFlexibleArrayMemberLike( - Ctx, Field, Field->getType(), StrictFlexArraysLevel, + Ctx, FD, FD->getType(), StrictFlexArraysLevel, /*IgnoreTemplateOrMacroSubstitution=*/true)) { const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(RD); Offset += Layout.getFieldOffset(FieldNo); - return Field; + return FD; } - if (const auto *Record = dyn_cast(D)) - if (const FieldDecl *Field = - FindFlexibleArrayMemberField(Ctx, Record, Name, Offset)) { + QualType Ty = FD->getType(); + if (Ty->isRecordType()) { + if (const FieldDecl *Field = FindFlexibleArrayMemberField( + Ctx, Ty->getAsRecordDecl(), Name, Offset)) { const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(RD); Offset += Layout.getFieldOffset(FieldNo); return Field; } + } - if (!IsUnion && isa(D)) + if (!RD->isUnion()) ++FieldNo; } @@ -18279,65 +18282,216 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32: - case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: { + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12: + case 
AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32: + case 
AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: { // These operations perform a matrix multiplication and accumulation of // the form: // D = A * B + C - // The return type always matches the type of matrix C. - unsigned ArgForMatchingRetType; + // We need to specify one type for matrices AB and one for matrices CD. + // Sparse matrix operations can have different types for A and B as well as + // an additional type for sparsity index. + // Destination type should be put before types used for source operands. + SmallVector ArgsForMatchingMatrixTypes; + // On GFX12, the intrinsics with 16-bit accumulator use a packed layout. + // There is no need for the variable opsel argument, so always set it to + // "false". + bool AppendFalseForOpselArg = false; unsigned BuiltinWMMAOp; switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64: - ArgForMatchingRetType = 2; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16; break; case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64: - ArgForMatchingRetType = 2; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16; break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12: + case 
AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12: + AppendFalseForOpselArg = true; + LLVM_FALLTHROUGH; case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32: case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64: - ArgForMatchingRetType = 2; + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16; break; + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12: + AppendFalseForOpselArg = true; + LLVM_FALLTHROUGH; case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32: case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64: - ArgForMatchingRetType = 2; + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16; break; case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32: case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64: - ArgForMatchingRetType = 2; + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied; break; case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32: case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64: - ArgForMatchingRetType = 2; + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied; break; case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: - ArgForMatchingRetType = 4; + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12: + ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8; break; case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64: - ArgForMatchingRetType = 4; + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12: + 
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12: + ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4; break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12: + ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16; + 
break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64: + 
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8; + break; } SmallVector Args; for (int i = 0, e = E->getNumArgs(); i != e; ++i) Args.push_back(EmitScalarExpr(E->getArg(i))); + if (AppendFalseForOpselArg) + Args.push_back(Builder.getFalse()); - Function *F = CGM.getIntrinsic(BuiltinWMMAOp, - {Args[ArgForMatchingRetType]->getType()}); + SmallVector ArgTypes; + for (auto ArgIdx : ArgsForMatchingMatrixTypes) + ArgTypes.push_back(Args[ArgIdx]->getType()); + Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes); return Builder.CreateCall(F, Args); } diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 28c211aa631e4d..a6a2f3595fe7db 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -1581,6 +1581,11 @@ bool CodeGenModule::ReturnTypeUsesSRet(const CGFunctionInfo &FI) { return RI.isIndirect() || (RI.isInAlloca() && RI.getInAllocaSRet()); } +bool CodeGenModule::ReturnTypeHasInReg(const CGFunctionInfo &FI) { + const auto &RI = FI.getReturnInfo(); + return RI.getInReg(); +} + bool CodeGenModule::ReturnSlotInterferesWithArgs(const CGFunctionInfo &FI) { return ReturnTypeUsesSRet(FI) && getTargetCodeGenInfo().doesReturnSlotInterfereWithArgs(); diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index bbe14ef4c17244..aa9997b87ecfa7 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -1241,27 +1241,38 @@ static void emitStoresForConstant(CodeGenModule &CGM, const VarDecl &D, return; } - // If the initializer is small, use a handful of stores. + // If the initializer is small or trivialAutoVarInit is set, use a handful of + // stores. 
+ bool IsTrivialAutoVarInitPattern = + CGM.getContext().getLangOpts().getTrivialAutoVarInit() == + LangOptions::TrivialAutoVarInitKind::Pattern; if (shouldSplitConstantStore(CGM, ConstantSize)) { if (auto *STy = dyn_cast(Ty)) { - const llvm::StructLayout *Layout = - CGM.getDataLayout().getStructLayout(STy); - for (unsigned i = 0; i != constant->getNumOperands(); i++) { - CharUnits CurOff = CharUnits::fromQuantity(Layout->getElementOffset(i)); - Address EltPtr = Builder.CreateConstInBoundsByteGEP( - Loc.withElementType(CGM.Int8Ty), CurOff); - emitStoresForConstant(CGM, D, EltPtr, isVolatile, Builder, - constant->getAggregateElement(i), IsAutoInit); + if (STy == Loc.getElementType() || + (STy != Loc.getElementType() && IsTrivialAutoVarInitPattern)) { + const llvm::StructLayout *Layout = + CGM.getDataLayout().getStructLayout(STy); + for (unsigned i = 0; i != constant->getNumOperands(); i++) { + CharUnits CurOff = + CharUnits::fromQuantity(Layout->getElementOffset(i)); + Address EltPtr = Builder.CreateConstInBoundsByteGEP( + Loc.withElementType(CGM.Int8Ty), CurOff); + emitStoresForConstant(CGM, D, EltPtr, isVolatile, Builder, + constant->getAggregateElement(i), IsAutoInit); + } + return; } - return; } else if (auto *ATy = dyn_cast(Ty)) { - for (unsigned i = 0; i != ATy->getNumElements(); i++) { - Address EltPtr = Builder.CreateConstGEP( - Loc.withElementType(ATy->getElementType()), i); - emitStoresForConstant(CGM, D, EltPtr, isVolatile, Builder, - constant->getAggregateElement(i), IsAutoInit); + if (ATy == Loc.getElementType() || + (ATy != Loc.getElementType() && IsTrivialAutoVarInitPattern)) { + for (unsigned i = 0; i != ATy->getNumElements(); i++) { + Address EltPtr = Builder.CreateConstGEP( + Loc.withElementType(ATy->getElementType()), i); + emitStoresForConstant(CGM, D, EltPtr, isVolatile, Builder, + constant->getAggregateElement(i), IsAutoInit); + } + return; } - return; } } diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 
c5f6b6d3a99f0b..f8f9979099775f 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -5846,6 +5846,7 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType, const CGCallee &OrigCallee // destruction order is not necessarily reverse construction order. // FIXME: Revisit this based on C++ committee response to unimplementability. EvaluationOrder Order = EvaluationOrder::Default; + bool StaticOperator = false; if (auto *OCE = dyn_cast(E)) { if (OCE->isAssignmentOp()) Order = EvaluationOrder::ForceRightToLeft; @@ -5863,10 +5864,22 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType, const CGCallee &OrigCallee break; } } + + if (const auto *MD = + dyn_cast_if_present(OCE->getCalleeDecl()); + MD && MD->isStatic()) + StaticOperator = true; } - EmitCallArgs(Args, dyn_cast(FnType), E->arguments(), - E->getDirectCallee(), /*ParamsToSkip*/ 0, Order); + auto Arguments = E->arguments(); + if (StaticOperator) { + // If we're calling a static operator, we need to emit the object argument + // and ignore it. 
+ EmitIgnoredExpr(E->getArg(0)); + Arguments = drop_begin(Arguments, 1); + } + EmitCallArgs(Args, dyn_cast(FnType), Arguments, + E->getDirectCallee(), /*ParamsToSkip=*/0, Order); const CGFunctionInfo &FnInfo = CGM.getTypes().arrangeFreeFunctionCall( Args, FnType, /*ChainCall=*/Chain); diff --git a/clang/lib/CodeGen/CGObjCGNU.cpp b/clang/lib/CodeGen/CGObjCGNU.cpp index a36b0cdddaf0af..05e3f8d4bfc2a3 100644 --- a/clang/lib/CodeGen/CGObjCGNU.cpp +++ b/clang/lib/CodeGen/CGObjCGNU.cpp @@ -2903,23 +2903,29 @@ CGObjCGNU::GenerateMessageSend(CodeGenFunction &CGF, break; case CodeGenOptions::Mixed: case CodeGenOptions::NonLegacy: + StringRef name = "objc_msgSend"; if (CGM.ReturnTypeUsesFPRet(ResultType)) { - imp = - CGM.CreateRuntimeFunction(llvm::FunctionType::get(IdTy, IdTy, true), - "objc_msgSend_fpret") - .getCallee(); + name = "objc_msgSend_fpret"; } else if (CGM.ReturnTypeUsesSRet(MSI.CallInfo)) { - // The actual types here don't matter - we're going to bitcast the - // function anyway - imp = - CGM.CreateRuntimeFunction(llvm::FunctionType::get(IdTy, IdTy, true), - "objc_msgSend_stret") - .getCallee(); - } else { - imp = CGM.CreateRuntimeFunction( - llvm::FunctionType::get(IdTy, IdTy, true), "objc_msgSend") - .getCallee(); + name = "objc_msgSend_stret"; + + // The address of the memory block is be passed in x8 for POD type, + // or in x0 for non-POD type (marked as inreg). 
+ bool shouldCheckForInReg = + CGM.getContext() + .getTargetInfo() + .getTriple() + .isWindowsMSVCEnvironment() && + CGM.getContext().getTargetInfo().getTriple().isAArch64(); + if (shouldCheckForInReg && CGM.ReturnTypeHasInReg(MSI.CallInfo)) { + name = "objc_msgSend_stret2"; + } } + // The actual types here don't matter - we're going to bitcast the + // function anyway + imp = CGM.CreateRuntimeFunction(llvm::FunctionType::get(IdTy, IdTy, true), + name) + .getCallee(); } // Reset the receiver in case the lookup modified it diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h index ec34680fd3f7e6..d9ece4d98eecae 100644 --- a/clang/lib/CodeGen/CodeGenModule.h +++ b/clang/lib/CodeGen/CodeGenModule.h @@ -1239,6 +1239,9 @@ class CodeGenModule : public CodeGenTypeCache { /// Return true iff the given type uses 'sret' when used as a return type. bool ReturnTypeUsesSRet(const CGFunctionInfo &FI); + /// Return true iff the given type has `inreg` set. + bool ReturnTypeHasInReg(const CGFunctionInfo &FI); + /// Return true iff the given type uses an argument slot when 'sret' is used /// as a return type. bool ReturnSlotInterferesWithArgs(const CGFunctionInfo &FI); diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index 5d7c3847745762..fb4e86e8bd8053 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -240,9 +240,12 @@ struct MapRegionCounters : public RecursiveASTVisitor { if (MCDCMaxCond == 0) return true; - /// At the top of the logical operator nest, reset the number of conditions. - if (LogOpStack.empty()) + /// At the top of the logical operator nest, reset the number of conditions, + /// also forget previously seen split nesting cases. 
+ if (LogOpStack.empty()) { NumCond = 0; + SplitNestedLogicalOp = false; + } if (const Expr *E = dyn_cast(S)) { const BinaryOperator *BinOp = dyn_cast(E->IgnoreParens()); @@ -293,7 +296,7 @@ struct MapRegionCounters : public RecursiveASTVisitor { "contains an operation with a nested boolean expression. " "Expression will not be covered"); Diag.Report(S->getBeginLoc(), DiagID); - return false; + return true; } /// Was the maximum number of conditions encountered? @@ -304,7 +307,7 @@ struct MapRegionCounters : public RecursiveASTVisitor { "number of conditions (%0) exceeds max (%1). " "Expression will not be covered"); Diag.Report(S->getBeginLoc(), DiagID) << NumCond << MCDCMaxCond; - return false; + return true; } // Otherwise, allocate the number of bytes required for the bitmap diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp index 5eca00f22bb83c..ae4e6d4c88c02d 100644 --- a/clang/lib/CodeGen/CoverageMappingGen.cpp +++ b/clang/lib/CodeGen/CoverageMappingGen.cpp @@ -1207,6 +1207,12 @@ struct CounterCoverageMappingBuilder /// Find a valid gap range between \p AfterLoc and \p BeforeLoc. std::optional findGapAreaBetween(SourceLocation AfterLoc, SourceLocation BeforeLoc) { + // Some statements (like AttributedStmt and ImplicitValueInitExpr) don't + // have valid source locations. Do not emit a gap region if this is the case + // in either AfterLoc end or BeforeLoc end. + if (AfterLoc.isInvalid() || BeforeLoc.isInvalid()) + return std::nullopt; + // If AfterLoc is in function-like macro, use the right parenthesis // location. if (AfterLoc.isMacroID()) { @@ -1370,9 +1376,8 @@ struct CounterCoverageMappingBuilder for (const Stmt *Child : S->children()) if (Child) { // If last statement contains terminate statements, add a gap area - // between the two statements. Skipping attributed statements, because - // they don't have valid start location. 
- if (LastStmt && HasTerminateStmt && !isa(Child)) { + // between the two statements. + if (LastStmt && HasTerminateStmt) { auto Gap = findGapAreaBetween(getEnd(LastStmt), getStart(Child)); if (Gap) fillGapAreaWithCount(Gap->getBegin(), Gap->getEnd(), @@ -1812,8 +1817,10 @@ struct CounterCoverageMappingBuilder assert(S->isConstexpr()); // evaluate constant condition... - const auto *E = cast(S->getCond()); - const bool isTrue = E->getResultAsAPSInt().getExtValue(); + const bool isTrue = + S->getCond() + ->EvaluateKnownConstInt(CVM.getCodeGenModule().getContext()) + .getBoolValue(); extendRegion(S); diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp index 172c4c937b9728..4d0f4c63f843b8 100644 --- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp +++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp @@ -1135,9 +1135,15 @@ static bool isTrivialForMSVC(const CXXRecordDecl *RD, QualType Ty, return false; if (RD->hasNonTrivialCopyAssignment()) return false; - for (const CXXConstructorDecl *Ctor : RD->ctors()) - if (Ctor->isUserProvided()) - return false; + for (const Decl *D : RD->decls()) { + if (auto *Ctor = dyn_cast(D)) { + if (Ctor->isUserProvided()) + return false; + } else if (auto *Template = dyn_cast(D)) { + if (isa(Template->getTemplatedDecl())) + return false; + } + } if (RD->hasNonTrivialDestructor()) return false; return true; diff --git a/clang/lib/CodeGen/Targets/RISCV.cpp b/clang/lib/CodeGen/Targets/RISCV.cpp index 0851d1993d0c0f..02c86ad2e58cac 100644 --- a/clang/lib/CodeGen/Targets/RISCV.cpp +++ b/clang/lib/CodeGen/Targets/RISCV.cpp @@ -321,20 +321,28 @@ ABIArgInfo RISCVABIInfo::coerceVLSVector(QualType Ty) const { assert(Ty->isVectorType() && "expected vector type!"); const auto *VT = Ty->castAs(); - assert(VT->getVectorKind() == VectorKind::RVVFixedLengthData && - "Unexpected vector kind"); - assert(VT->getElementType()->isBuiltinType() && "expected builtin type!"); auto VScale = 
getContext().getTargetInfo().getVScaleRange(getContext().getLangOpts()); + + unsigned NumElts = VT->getNumElements(); + llvm::Type *EltType; + if (VT->getVectorKind() == VectorKind::RVVFixedLengthMask) { + NumElts *= 8; + EltType = llvm::Type::getInt1Ty(getVMContext()); + } else { + assert(VT->getVectorKind() == VectorKind::RVVFixedLengthData && + "Unexpected vector kind"); + EltType = CGT.ConvertType(VT->getElementType()); + } + // The MinNumElts is simplified from equation: // NumElts / VScale = // (EltSize * NumElts / (VScale * RVVBitsPerBlock)) // * (RVVBitsPerBlock / EltSize) llvm::ScalableVectorType *ResType = - llvm::ScalableVectorType::get(CGT.ConvertType(VT->getElementType()), - VT->getNumElements() / VScale->first); + llvm::ScalableVectorType::get(EltType, NumElts / VScale->first); return ABIArgInfo::getDirect(ResType); } @@ -437,7 +445,8 @@ ABIArgInfo RISCVABIInfo::classifyArgumentType(QualType Ty, bool IsFixed, } if (const VectorType *VT = Ty->getAs()) - if (VT->getVectorKind() == VectorKind::RVVFixedLengthData) + if (VT->getVectorKind() == VectorKind::RVVFixedLengthData || + VT->getVectorKind() == VectorKind::RVVFixedLengthMask) return coerceVLSVector(Ty); // Aggregates which are <= 2*XLen will be passed in registers if possible, diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 7109faa1072de5..93cddf742d521d 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -4764,9 +4764,9 @@ Action *Driver::ConstructPhaseAction( case phases::Backend: { if (isUsingLTO() && TargetDeviceOffloadKind == Action::OFK_None) { types::ID Output; - if (Args.hasArg(options::OPT_ffat_lto_objects)) - Output = Args.hasArg(options::OPT_emit_llvm) ? 
types::TY_LTO_IR - : types::TY_PP_Asm; + if (Args.hasArg(options::OPT_ffat_lto_objects) && + !Args.hasArg(options::OPT_emit_llvm)) + Output = types::TY_PP_Asm; else if (Args.hasArg(options::OPT_S)) Output = types::TY_LTO_IR; else diff --git a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp index 22e583021515e5..ae1a4ba7882627 100644 --- a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp +++ b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp @@ -178,4 +178,85 @@ void sparc::getSparcTargetFeatures(const Driver &D, const ArgList &Args, else Features.push_back("-hard-quad-float"); } + + if (Args.hasArg(options::OPT_ffixed_g1)) + Features.push_back("+reserve-g1"); + + if (Args.hasArg(options::OPT_ffixed_g2)) + Features.push_back("+reserve-g2"); + + if (Args.hasArg(options::OPT_ffixed_g3)) + Features.push_back("+reserve-g3"); + + if (Args.hasArg(options::OPT_ffixed_g4)) + Features.push_back("+reserve-g4"); + + if (Args.hasArg(options::OPT_ffixed_g5)) + Features.push_back("+reserve-g5"); + + if (Args.hasArg(options::OPT_ffixed_g6)) + Features.push_back("+reserve-g6"); + + if (Args.hasArg(options::OPT_ffixed_g7)) + Features.push_back("+reserve-g7"); + + if (Args.hasArg(options::OPT_ffixed_o0)) + Features.push_back("+reserve-o0"); + + if (Args.hasArg(options::OPT_ffixed_o1)) + Features.push_back("+reserve-o1"); + + if (Args.hasArg(options::OPT_ffixed_o2)) + Features.push_back("+reserve-o2"); + + if (Args.hasArg(options::OPT_ffixed_o3)) + Features.push_back("+reserve-o3"); + + if (Args.hasArg(options::OPT_ffixed_o4)) + Features.push_back("+reserve-o4"); + + if (Args.hasArg(options::OPT_ffixed_o5)) + Features.push_back("+reserve-o5"); + + if (Args.hasArg(options::OPT_ffixed_l0)) + Features.push_back("+reserve-l0"); + + if (Args.hasArg(options::OPT_ffixed_l1)) + Features.push_back("+reserve-l1"); + + if (Args.hasArg(options::OPT_ffixed_l2)) + Features.push_back("+reserve-l2"); + + if (Args.hasArg(options::OPT_ffixed_l3)) + 
Features.push_back("+reserve-l3"); + + if (Args.hasArg(options::OPT_ffixed_l4)) + Features.push_back("+reserve-l4"); + + if (Args.hasArg(options::OPT_ffixed_l5)) + Features.push_back("+reserve-l5"); + + if (Args.hasArg(options::OPT_ffixed_l6)) + Features.push_back("+reserve-l6"); + + if (Args.hasArg(options::OPT_ffixed_l7)) + Features.push_back("+reserve-l7"); + + if (Args.hasArg(options::OPT_ffixed_i0)) + Features.push_back("+reserve-i0"); + + if (Args.hasArg(options::OPT_ffixed_i1)) + Features.push_back("+reserve-i1"); + + if (Args.hasArg(options::OPT_ffixed_i2)) + Features.push_back("+reserve-i2"); + + if (Args.hasArg(options::OPT_ffixed_i3)) + Features.push_back("+reserve-i3"); + + if (Args.hasArg(options::OPT_ffixed_i4)) + Features.push_back("+reserve-i4"); + + if (Args.hasArg(options::OPT_ffixed_i5)) + Features.push_back("+reserve-i5"); } diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 5dc614e11aab59..aa344b3465ab27 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -3942,6 +3942,10 @@ static bool RenderModulesOptions(Compilation &C, const Driver &D, Args.ClaimAllArgs(options::OPT_fmodules_disable_diagnostic_validation); } + // FIXME: We provisionally don't check ODR violations for decls in the global + // module fragment. + CmdArgs.push_back("-fskip-odr-check-in-gmf"); + // Claim `-fmodule-output` and `-fmodule-output=` to avoid unused warnings. Args.ClaimAllArgs(options::OPT_fmodule_output); Args.ClaimAllArgs(options::OPT_fmodule_output_EQ); @@ -5779,6 +5783,14 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, // NVPTX/AMDGPU does not care about the code model and will accept // whatever works for the host. 
Ok = true; + } else if (Triple.isSPARC64()) { + if (CM == "medlow") + CM = "small"; + else if (CM == "medmid") + CM = "medium"; + else if (CM == "medany") + CM = "large"; + Ok = CM == "small" || CM == "medium" || CM == "large"; } if (Ok) { CmdArgs.push_back(Args.MakeArgString("-mcmodel=" + CM)); @@ -5822,6 +5834,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.AddLastArg(CmdArgs, options::OPT_mtls_size_EQ); } + if (isTLSDESCEnabled(TC, Args)) + CmdArgs.push_back("-enable-tlsdesc"); + // Add the target cpu std::string CPU = getCPUName(D, Args, Triple, /*FromAs*/ false); if (!CPU.empty()) { diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index fadaf3e60c6616..2b916f0003368d 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -729,6 +729,33 @@ bool tools::isUseSeparateSections(const llvm::Triple &Triple) { return Triple.isPS(); } +bool tools::isTLSDESCEnabled(const ToolChain &TC, + const llvm::opt::ArgList &Args) { + const llvm::Triple &Triple = TC.getEffectiveTriple(); + Arg *A = Args.getLastArg(options::OPT_mtls_dialect_EQ); + if (!A) + return Triple.hasDefaultTLSDESC(); + StringRef V = A->getValue(); + bool SupportedArgument = false, EnableTLSDESC = false; + bool Unsupported = !Triple.isOSBinFormatELF(); + if (Triple.isRISCV()) { + SupportedArgument = V == "desc" || V == "trad"; + EnableTLSDESC = V == "desc"; + } else if (Triple.isX86()) { + SupportedArgument = V == "gnu"; + } else { + Unsupported = true; + } + if (Unsupported) { + TC.getDriver().Diag(diag::err_drv_unsupported_opt_for_target) + << A->getSpelling() << Triple.getTriple(); + } else if (!SupportedArgument) { + TC.getDriver().Diag(diag::err_drv_unsupported_option_argument_for_target) + << A->getSpelling() << V << Triple.getTriple(); + } + return EnableTLSDESC; +} + void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, ArgStringList &CmdArgs, const 
InputInfo &Output, const InputInfo &Input, bool IsThinLTO) { @@ -783,6 +810,28 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, "-generate-arange-section")); } + // Pass vector library arguments to LTO. + Arg *ArgVecLib = Args.getLastArg(options::OPT_fveclib); + if (ArgVecLib && ArgVecLib->getNumValues() == 1) { + // Map the vector library names from clang front-end to opt front-end. The + // values are taken from the TargetLibraryInfo class command line options. + std::optional OptVal = + llvm::StringSwitch>(ArgVecLib->getValue()) + .Case("Accelerate", "Accelerate") + .Case("LIBMVEC", "LIBMVEC-X86") + .Case("MASSV", "MASSV") + .Case("SVML", "SVML") + .Case("SLEEF", "sleefgnuabi") + .Case("Darwin_libsystem_m", "Darwin_libsystem_m") + .Case("ArmPL", "ArmPL") + .Case("none", "none") + .Default(std::nullopt); + + if (OptVal) + CmdArgs.push_back(Args.MakeArgString( + Twine(PluginOptPrefix) + "-vector-library=" + OptVal.value())); + } + // Try to pass driver level flags relevant to LTO code generation down to // the plugin. @@ -988,6 +1037,9 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "-emulated-tls")); } + if (isTLSDESCEnabled(ToolChain, Args)) + CmdArgs.push_back( + Args.MakeArgString(Twine(PluginOptPrefix) + "-enable-tlsdesc")); if (Args.hasFlag(options::OPT_fstack_size_section, options::OPT_fno_stack_size_section, false)) diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h index 25d68345a9f9eb..807867f13a5c30 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.h +++ b/clang/lib/Driver/ToolChains/CommonArgs.h @@ -144,6 +144,9 @@ llvm::StringRef getLTOParallelism(const llvm::opt::ArgList &Args, bool areOptimizationsEnabled(const llvm::opt::ArgList &Args); bool isUseSeparateSections(const llvm::Triple &Triple); +// Parse -mtls-dialect=. 
Return true if the target supports both general-dynamic +// and TLSDESC, and TLSDESC is requested. +bool isTLSDESCEnabled(const ToolChain &TC, const llvm::opt::ArgList &Args); /// \p EnvVar is split by system delimiter for environment variables. /// If \p ArgName is "-I", "-L", or an empty string, each entry from \p EnvVar diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp index 57f4600727ec89..0b16b660364f07 100644 --- a/clang/lib/Driver/ToolChains/WebAssembly.cpp +++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp @@ -44,8 +44,15 @@ std::string wasm::Linker::getLinkerPath(const ArgList &Args) const { llvm::sys::fs::can_execute(UseLinker)) return std::string(UseLinker); - // Accept 'lld', and 'ld' as aliases for the default linker - if (UseLinker != "lld" && UseLinker != "ld") + // Interpret 'lld' as explicitly requesting `wasm-ld`, so look for that + // linker. Note that for `wasm32-wasip2` this overrides the default linker + // of `wasm-component-ld`. + if (UseLinker == "lld") { + return ToolChain.GetProgramPath("wasm-ld"); + } + + // Allow 'ld' as an alias for the default linker + if (UseLinker != "ld") ToolChain.getDriver().Diag(diag::err_drv_invalid_linker_name) << A->getAsString(Args); } @@ -73,6 +80,16 @@ void wasm::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (Args.hasArg(options::OPT_s)) CmdArgs.push_back("--strip-all"); + // On `wasip2` the default linker is `wasm-component-ld` which wraps the + // execution of `wasm-ld`. Find `wasm-ld` and pass it as an argument of where + // to find it to avoid it needing to hunt and rediscover or search `PATH` for + // where it is. 
+ if (llvm::sys::path::stem(Linker).ends_with_insensitive( + "wasm-component-ld")) { + CmdArgs.push_back("--wasm-ld-path"); + CmdArgs.push_back(Args.MakeArgString(ToolChain.GetProgramPath("wasm-ld"))); + } + Args.addAllArgs(CmdArgs, {options::OPT_L, options::OPT_u}); ToolChain.AddFilePathLibArgs(Args, CmdArgs); @@ -221,6 +238,12 @@ WebAssembly::WebAssembly(const Driver &D, const llvm::Triple &Triple, } } +const char *WebAssembly::getDefaultLinker() const { + if (getOS() == "wasip2") + return "wasm-component-ld"; + return "wasm-ld"; +} + bool WebAssembly::IsMathErrnoDefault() const { return false; } bool WebAssembly::IsObjCNonFragileABIDefault() const { return true; } diff --git a/clang/lib/Driver/ToolChains/WebAssembly.h b/clang/lib/Driver/ToolChains/WebAssembly.h index ae60f464c10818..76e0ca39bd748d 100644 --- a/clang/lib/Driver/ToolChains/WebAssembly.h +++ b/clang/lib/Driver/ToolChains/WebAssembly.h @@ -67,7 +67,7 @@ class LLVM_LIBRARY_VISIBILITY WebAssembly final : public ToolChain { llvm::opt::ArgStringList &CmdArgs) const override; SanitizerMask getSupportedSanitizers() const override; - const char *getDefaultLinker() const override { return "wasm-ld"; } + const char *getDefaultLinker() const override; CXXStdlibType GetDefaultCXXStdlibType() const override { return ToolChain::CST_Libcxx; diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index a3eb9138b21833..53cd169b05904a 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -674,7 +674,13 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, // arguments to function calls. We do this by ensuring that either all // arguments (including any lambdas) go on the same line as the function // call, or we break before the first argument. 
- auto PrevNonComment = Current.getPreviousNonComment(); + const auto *Prev = Current.Previous; + if (!Prev) + return false; + // For example, `/*Newline=*/false`. + if (Prev->is(TT_BlockComment) && Current.SpacesRequiredBefore == 0) + return false; + const auto *PrevNonComment = Current.getPreviousNonComment(); if (!PrevNonComment || PrevNonComment->isNot(tok::l_paren)) return false; if (Current.isOneOf(tok::comment, tok::l_paren, TT_LambdaLSquare)) diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index ff326dc784783b..0bbce92e962d49 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -504,22 +504,6 @@ struct ScalarEnumerationTraits { } }; -template <> -struct MappingTraits< - FormatStyle::SpaceBeforeParensCustom::AfterPlacementOperatorStyle> { - static void - mapping(IO &IO, - FormatStyle::SpaceBeforeParensCustom::AfterPlacementOperatorStyle - &Value) { - IO.enumCase(Value, "Always", - FormatStyle::SpaceBeforeParensCustom::APO_Always); - IO.enumCase(Value, "Never", - FormatStyle::SpaceBeforeParensCustom::APO_Never); - IO.enumCase(Value, "Leave", - FormatStyle::SpaceBeforeParensCustom::APO_Leave); - } -}; - template <> struct MappingTraits { static void mapping(IO &IO, FormatStyle::RawStringFormat &Format) { IO.mapOptional("Language", Format.Language); @@ -1388,12 +1372,9 @@ static void expandPresetsSpaceBeforeParens(FormatStyle &Expanded) { return; // Reset all flags Expanded.SpaceBeforeParensOptions = {}; + Expanded.SpaceBeforeParensOptions.AfterPlacementOperator = true; switch (Expanded.SpaceBeforeParens) { - case FormatStyle::SBPO_Never: - Expanded.SpaceBeforeParensOptions.AfterPlacementOperator = - FormatStyle::SpaceBeforeParensCustom::APO_Never; - break; case FormatStyle::SBPO_ControlStatements: Expanded.SpaceBeforeParensOptions.AfterControlStatements = true; Expanded.SpaceBeforeParensOptions.AfterForeachMacros = true; @@ -1405,8 +1386,6 @@ static void expandPresetsSpaceBeforeParens(FormatStyle &Expanded) { 
case FormatStyle::SBPO_NonEmptyParentheses: Expanded.SpaceBeforeParensOptions.BeforeNonEmptyParentheses = true; break; - case FormatStyle::SBPO_Always: - break; default: break; } @@ -3942,12 +3921,13 @@ const char *DefaultFallbackStyle = "LLVM"; llvm::ErrorOr> loadAndParseConfigFile(StringRef ConfigFile, llvm::vfs::FileSystem *FS, - FormatStyle *Style, bool AllowUnknownOptions) { + FormatStyle *Style, bool AllowUnknownOptions, + llvm::SourceMgr::DiagHandlerTy DiagHandler = nullptr) { llvm::ErrorOr> Text = FS->getBufferForFile(ConfigFile.str()); if (auto EC = Text.getError()) return EC; - if (auto EC = parseConfiguration(*Text.get(), Style, AllowUnknownOptions)) + if (auto EC = parseConfiguration(*Text.get(), Style, AllowUnknownOptions, DiagHandler)) return EC; return Text; } @@ -3955,7 +3935,8 @@ loadAndParseConfigFile(StringRef ConfigFile, llvm::vfs::FileSystem *FS, llvm::Expected getStyle(StringRef StyleName, StringRef FileName, StringRef FallbackStyleName, StringRef Code, llvm::vfs::FileSystem *FS, - bool AllowUnknownOptions) { + bool AllowUnknownOptions, + llvm::SourceMgr::DiagHandlerTy DiagHandler) { FormatStyle Style = getLLVMStyle(guessLanguage(FileName, Code)); FormatStyle FallbackStyle = getNoStyle(); if (!getPredefinedStyle(FallbackStyleName, Style.Language, &FallbackStyle)) @@ -3969,7 +3950,7 @@ llvm::Expected getStyle(StringRef StyleName, StringRef FileName, StringRef Source = ""; if (std::error_code ec = parseConfiguration(llvm::MemoryBufferRef(StyleName, Source), &Style, - AllowUnknownOptions)) { + AllowUnknownOptions, DiagHandler)) { return make_string_error("Error parsing -style: " + ec.message()); } @@ -3989,7 +3970,7 @@ llvm::Expected getStyle(StringRef StyleName, StringRef FileName, StyleName.starts_with_insensitive("file:")) { auto ConfigFile = StyleName.substr(5); llvm::ErrorOr> Text = - loadAndParseConfigFile(ConfigFile, FS, &Style, AllowUnknownOptions); + loadAndParseConfigFile(ConfigFile, FS, &Style, AllowUnknownOptions, DiagHandler); if 
(auto EC = Text.getError()) { return make_string_error("Error reading " + ConfigFile + ": " + EC.message()); @@ -4024,12 +4005,13 @@ llvm::Expected getStyle(StringRef StyleName, StringRef FileName, // Reset possible inheritance Style.InheritsParentConfig = false; - auto dropDiagnosticHandler = [](const llvm::SMDiagnostic &, void *) {}; + auto diagHandlerOrDropHandling = + DiagHandler ? DiagHandler : [](llvm::SMDiagnostic const &, void *) {}; auto applyChildFormatTexts = [&](FormatStyle *Style) { for (const auto &MemBuf : llvm::reverse(ChildFormatTextToApply)) { auto EC = parseConfiguration(*MemBuf, Style, AllowUnknownOptions, - dropDiagnosticHandler); + diagHandlerOrDropHandling); // It was already correctly parsed. assert(!EC); static_cast(EC); @@ -4063,7 +4045,7 @@ llvm::Expected getStyle(StringRef StyleName, StringRef FileName, } llvm::ErrorOr> Text = - loadAndParseConfigFile(ConfigFile, FS, &Style, AllowUnknownOptions); + loadAndParseConfigFile(ConfigFile, FS, &Style, AllowUnknownOptions, DiagHandler); if (auto EC = Text.getError()) { if (EC != ParseError::Unsuitable) { return make_string_error("Error reading " + ConfigFile + ": " + diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 25fcceb8786437..c1f16624819223 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -2488,6 +2488,8 @@ class AnnotatingParser { (Tok.Next->Next->is(tok::numeric_constant) || Line.InPPDirective)) { return false; } + if (Line.InPPDirective && Tok.Next->is(tok::minus)) + return false; // Search for unexpected tokens. for (FormatToken *Prev = Tok.Previous; Prev != Tok.MatchingParen; Prev = Prev->Previous) { @@ -3448,10 +3450,11 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { for (AnnotatedLine *ChildLine : Line.Children) calculateFormattingInformation(*ChildLine); - Line.First->TotalLength = - Line.First->IsMultiline ? 
Style.ColumnLimit - : Line.FirstStartColumn + Line.First->ColumnWidth; - FormatToken *Current = Line.First->Next; + auto *First = Line.First; + First->TotalLength = First->IsMultiline + ? Style.ColumnLimit + : Line.FirstStartColumn + First->ColumnWidth; + FormatToken *Current = First->Next; bool InFunctionDecl = Line.MightBeFunctionDecl; bool AlignArrayOfStructures = (Style.AlignArrayOfStructures != FormatStyle::AIAS_None && @@ -3473,16 +3476,15 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { if (const bool IsCtorOrDtor = Tok->is(TT_CtorDtorDeclName); IsCtorOrDtor || isFunctionDeclarationName(Style.isCpp(), *Tok, Line, ClosingParen)) { - if (!IsCtorOrDtor) { - LineIsFunctionDeclaration = true; + if (!IsCtorOrDtor) Tok->setFinalizedType(TT_FunctionDeclarationName); - } + LineIsFunctionDeclaration = true; SeenName = true; break; } } - if (IsCpp && LineIsFunctionDeclaration && + if (IsCpp && (LineIsFunctionDeclaration || First->is(TT_CtorDtorDeclName)) && Line.endsWith(tok::semi, tok::r_brace)) { auto *Tok = Line.Last->Previous; while (Tok->isNot(tok::r_brace)) @@ -3505,7 +3507,7 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { if (IsCpp) { if (!LineIsFunctionDeclaration) { // Annotate */&/&& in `operator` function calls as binary operators. 
- for (const auto *Tok = Line.First; Tok; Tok = Tok->Next) { + for (const auto *Tok = First; Tok; Tok = Tok->Next) { if (Tok->isNot(tok::kw_operator)) continue; do { @@ -3530,6 +3532,8 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { } } else if (ClosingParen) { for (auto *Tok = ClosingParen->Next; Tok; Tok = Tok->Next) { + if (Tok->is(TT_CtorInitializerColon)) + break; if (Tok->is(tok::arrow)) { Tok->setType(TT_TrailingReturnArrow); break; @@ -3642,7 +3646,7 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { calculateUnbreakableTailLengths(Line); unsigned IndentLevel = Line.Level; - for (Current = Line.First; Current; Current = Current->Next) { + for (Current = First; Current; Current = Current->Next) { if (Current->Role) Current->Role->precomputeFormattingInfos(Current); if (Current->MatchingParen && @@ -4272,14 +4276,7 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, Left.isOneOf(tok::kw_new, tok::kw_delete) && Right.isNot(TT_OverloadedOperatorLParen) && !(Line.MightBeFunctionDecl && Left.is(TT_FunctionDeclarationName))) { - if (Style.SpaceBeforeParensOptions.AfterPlacementOperator == - FormatStyle::SpaceBeforeParensCustom::APO_Always || - (Style.SpaceBeforeParensOptions.AfterPlacementOperator == - FormatStyle::SpaceBeforeParensCustom::APO_Leave && - Right.hasWhitespaceBefore())) { - return true; - } - return false; + return Style.SpaceBeforeParensOptions.AfterPlacementOperator; } if (Line.Type == LT_ObjCDecl) return true; @@ -5162,12 +5159,8 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line, return true; if (Left.IsUnterminatedLiteral) return true; - // FIXME: Breaking after newlines seems useful in general. Turn this into an - // option and recognize more cases like endl etc, and break independent of - // what comes after operator lessless. 
- if (Right.is(tok::lessless) && Right.Next && - Right.Next->is(tok::string_literal) && Left.is(tok::string_literal) && - Left.TokenText.ends_with("\\n\"")) { + if (Right.is(tok::lessless) && Right.Next && Left.is(tok::string_literal) && + Right.Next->is(tok::string_literal)) { return true; } if (Right.is(TT_RequiresClause)) { diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index b904e0e56d9eb3..a6eb18bb2b3227 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -489,18 +489,23 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { }; SmallVector LBraceStack; assert(Tok->is(tok::l_brace)); + do { - // Get next non-comment, non-preprocessor token. FormatToken *NextTok; do { NextTok = Tokens->getNextToken(); } while (NextTok->is(tok::comment)); - while (NextTok->is(tok::hash) && !Line->InMacroBody) { - NextTok = Tokens->getNextToken(); - do { - NextTok = Tokens->getNextToken(); - } while (NextTok->is(tok::comment) || - (NextTok->NewlinesBefore == 0 && NextTok->isNot(tok::eof))); + + if (!Line->InMacroBody) { + // Skip PPDirective lines and comments. + while (NextTok->is(tok::hash)) { + do { + NextTok = Tokens->getNextToken(); + } while (NextTok->NewlinesBefore == 0 && NextTok->isNot(tok::eof)); + + while (NextTok->is(tok::comment)) + NextTok = Tokens->getNextToken(); + } } switch (Tok->Tok.getKind()) { @@ -534,16 +539,6 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { if (Style.Language == FormatStyle::LK_Proto) { ProbablyBracedList = NextTok->isOneOf(tok::comma, tok::r_square); } else { - // Skip NextTok over preprocessor lines, otherwise we may not - // properly diagnose the block as a braced intializer - // if the comma separator appears after the pp directive. 
- while (NextTok->is(tok::hash)) { - ScopedMacroState MacroState(*Line, Tokens, NextTok); - do { - NextTok = Tokens->getNextToken(); - } while (NextTok->isNot(tok::eof)); - } - // Using OriginalColumn to distinguish between ObjC methods and // binary operators is a bit hacky. bool NextIsObjCMethod = NextTok->isOneOf(tok::plus, tok::minus) && @@ -602,6 +597,16 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { NextTok = Tokens->getNextToken(); ProbablyBracedList = NextTok->isNot(tok::l_square); } + + // Cpp macro definition body that is a nonempty braced list or block: + if (Style.isCpp() && Line->InMacroBody && PrevTok != FormatTok && + !FormatTok->Previous && NextTok->is(tok::eof) && + // A statement can end with only `;` (simple statement), a block + // closing brace (compound statement), or `:` (label statement). + // If PrevTok is a block opening brace, Tok ends an empty block. + !PrevTok->isOneOf(tok::semi, BK_Block, tok::colon)) { + ProbablyBracedList = true; + } } if (ProbablyBracedList) { Tok->setBlockKind(BK_BracedInit); @@ -631,6 +636,7 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { default: break; } + PrevTok = Tok; Tok = NextTok; } while (Tok->isNot(tok::eof) && !LBraceStack.empty()); @@ -2515,7 +2521,7 @@ bool UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) { parseChildBlock(); break; case tok::r_paren: - if (!MightBeStmtExpr && + if (!MightBeStmtExpr && !Line->InMacroBody && Style.RemoveParentheses > FormatStyle::RPS_Leave) { const auto *Prev = LeftParen->Previous; const auto *Next = Tokens->peekNextToken(); diff --git a/clang/lib/Headers/__stddef_null.h b/clang/lib/Headers/__stddef_null.h index 7336fdab389723..c10bd2d7d9887c 100644 --- a/clang/lib/Headers/__stddef_null.h +++ b/clang/lib/Headers/__stddef_null.h @@ -7,7 +7,7 @@ *===-----------------------------------------------------------------------=== */ -#if !defined(NULL) || !__has_feature(modules) +#if !defined(NULL) || 
!__building_module(_Builtin_stddef) /* linux/stddef.h will define NULL to 0. glibc (and other) headers then define * __need_NULL and rely on stddef.h to redefine NULL to the correct value again. diff --git a/clang/lib/Headers/__stddef_nullptr_t.h b/clang/lib/Headers/__stddef_nullptr_t.h index 183d394d56c1b7..7f3fbe6fe0d3a8 100644 --- a/clang/lib/Headers/__stddef_nullptr_t.h +++ b/clang/lib/Headers/__stddef_nullptr_t.h @@ -7,7 +7,12 @@ *===-----------------------------------------------------------------------=== */ -#ifndef _NULLPTR_T +/* + * When -fbuiltin-headers-in-system-modules is set this is a non-modular header + * and needs to behave as if it was textual. + */ +#if !defined(_NULLPTR_T) || \ + (__has_feature(modules) && !__building_module(_Builtin_stddef)) #define _NULLPTR_T #ifdef __cplusplus diff --git a/clang/lib/Headers/__stddef_offsetof.h b/clang/lib/Headers/__stddef_offsetof.h index 3b347b3b92f62c..84172c6cd27352 100644 --- a/clang/lib/Headers/__stddef_offsetof.h +++ b/clang/lib/Headers/__stddef_offsetof.h @@ -7,6 +7,11 @@ *===-----------------------------------------------------------------------=== */ -#ifndef offsetof +/* + * When -fbuiltin-headers-in-system-modules is set this is a non-modular header + * and needs to behave as if it was textual. + */ +#if !defined(offsetof) || \ + (__has_feature(modules) && !__building_module(_Builtin_stddef)) #define offsetof(t, d) __builtin_offsetof(t, d) #endif diff --git a/clang/lib/Headers/__stddef_ptrdiff_t.h b/clang/lib/Headers/__stddef_ptrdiff_t.h index 3ea6d7d2852e1c..fd3c893c66c979 100644 --- a/clang/lib/Headers/__stddef_ptrdiff_t.h +++ b/clang/lib/Headers/__stddef_ptrdiff_t.h @@ -7,7 +7,12 @@ *===-----------------------------------------------------------------------=== */ -#ifndef _PTRDIFF_T +/* + * When -fbuiltin-headers-in-system-modules is set this is a non-modular header + * and needs to behave as if it was textual. 
+ */ +#if !defined(_PTRDIFF_T) || \ + (__has_feature(modules) && !__building_module(_Builtin_stddef)) #define _PTRDIFF_T typedef __PTRDIFF_TYPE__ ptrdiff_t; diff --git a/clang/lib/Headers/__stddef_rsize_t.h b/clang/lib/Headers/__stddef_rsize_t.h index b6428d0c12b62a..dd433d40d9733a 100644 --- a/clang/lib/Headers/__stddef_rsize_t.h +++ b/clang/lib/Headers/__stddef_rsize_t.h @@ -7,7 +7,12 @@ *===-----------------------------------------------------------------------=== */ -#ifndef _RSIZE_T +/* + * When -fbuiltin-headers-in-system-modules is set this is a non-modular header + * and needs to behave as if it was textual. + */ +#if !defined(_RSIZE_T) || \ + (__has_feature(modules) && !__building_module(_Builtin_stddef)) #define _RSIZE_T typedef __SIZE_TYPE__ rsize_t; diff --git a/clang/lib/Headers/__stddef_size_t.h b/clang/lib/Headers/__stddef_size_t.h index e4a389510bcdbf..3dd7b1f3792949 100644 --- a/clang/lib/Headers/__stddef_size_t.h +++ b/clang/lib/Headers/__stddef_size_t.h @@ -7,7 +7,12 @@ *===-----------------------------------------------------------------------=== */ -#ifndef _SIZE_T +/* + * When -fbuiltin-headers-in-system-modules is set this is a non-modular header + * and needs to behave as if it was textual. + */ +#if !defined(_SIZE_T) || \ + (__has_feature(modules) && !__building_module(_Builtin_stddef)) #define _SIZE_T typedef __SIZE_TYPE__ size_t; diff --git a/clang/lib/Headers/__stddef_unreachable.h b/clang/lib/Headers/__stddef_unreachable.h index 3e7fe01979662a..61df43e9732f8a 100644 --- a/clang/lib/Headers/__stddef_unreachable.h +++ b/clang/lib/Headers/__stddef_unreachable.h @@ -7,6 +7,15 @@ *===-----------------------------------------------------------------------=== */ -#ifndef unreachable +#ifndef __cplusplus + +/* + * When -fbuiltin-headers-in-system-modules is set this is a non-modular header + * and needs to behave as if it was textual. 
+ */ +#if !defined(unreachable) || \ + (__has_feature(modules) && !__building_module(_Builtin_stddef)) #define unreachable() __builtin_unreachable() #endif + +#endif diff --git a/clang/lib/Headers/__stddef_wchar_t.h b/clang/lib/Headers/__stddef_wchar_t.h index 16a6186512c0c3..bd69f632254163 100644 --- a/clang/lib/Headers/__stddef_wchar_t.h +++ b/clang/lib/Headers/__stddef_wchar_t.h @@ -9,7 +9,12 @@ #if !defined(__cplusplus) || (defined(_MSC_VER) && !_NATIVE_WCHAR_T_DEFINED) -#ifndef _WCHAR_T +/* + * When -fbuiltin-headers-in-system-modules is set this is a non-modular header + * and needs to behave as if it was textual. + */ +#if !defined(_WCHAR_T) || \ + (__has_feature(modules) && !__building_module(_Builtin_stddef)) #define _WCHAR_T #ifdef _MSC_EXTENSIONS diff --git a/clang/lib/Headers/larchintrin.h b/clang/lib/Headers/larchintrin.h index a613e5ca0e5ecd..f4218295919a0d 100644 --- a/clang/lib/Headers/larchintrin.h +++ b/clang/lib/Headers/larchintrin.h @@ -156,7 +156,7 @@ extern __inline unsigned char return (unsigned char)__builtin_loongarch_iocsrrd_b((unsigned int)_1); } -extern __inline unsigned char +extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __iocsrrd_h(unsigned int _1) { return (unsigned short)__builtin_loongarch_iocsrrd_h((unsigned int)_1); diff --git a/clang/lib/Headers/module.modulemap b/clang/lib/Headers/module.modulemap index a786689d391773..56a13f69bc0559 100644 --- a/clang/lib/Headers/module.modulemap +++ b/clang/lib/Headers/module.modulemap @@ -155,9 +155,9 @@ module _Builtin_intrinsics [system] [extern_c] { // Start -fbuiltin-headers-in-system-modules affected modules -// The following modules all ignore their top level headers -// when -fbuiltin-headers-in-system-modules is passed, and -// most of those headers join system modules when present. 
+// The following modules all ignore their headers when +// -fbuiltin-headers-in-system-modules is passed, and many of +// those headers join system modules when present. // e.g. if -fbuiltin-headers-in-system-modules is passed, then // float.h will not be in the _Builtin_float module (that module @@ -190,11 +190,6 @@ module _Builtin_stdalign [system] { export * } -// When -fbuiltin-headers-in-system-modules is passed, only -// the top level headers are removed, the implementation headers -// will always be in their submodules. That means when stdarg.h -// is included, it will still import this module and make the -// appropriate submodules visible. module _Builtin_stdarg [system] { textual header "stdarg.h" @@ -237,6 +232,8 @@ module _Builtin_stdbool [system] { module _Builtin_stddef [system] { textual header "stddef.h" + // __stddef_max_align_t.h is always in this module, even if + // -fbuiltin-headers-in-system-modules is passed. explicit module max_align_t { header "__stddef_max_align_t.h" export * @@ -283,9 +280,10 @@ module _Builtin_stddef [system] { } } -/* wint_t is provided by and not . It's here - * for compatibility, but must be explicitly requested. Therefore - * __stddef_wint_t.h is not part of _Builtin_stddef. */ +// wint_t is provided by and not . It's here +// for compatibility, but must be explicitly requested. Therefore +// __stddef_wint_t.h is not part of _Builtin_stddef. It is always in +// this module even if -fbuiltin-headers-in-system-modules is passed. 
module _Builtin_stddef_wint_t [system] { header "__stddef_wint_t.h" export * diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index afb2948f05ae5b..10c475f617d485 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -2498,9 +2498,12 @@ void ModuleMapParser::parseHeaderDecl(MMToken::TokenKind LeadingToken, } bool NeedsFramework = false; - // Don't add the top level headers to the builtin modules if the builtin headers - // belong to the system modules. - if (!Map.LangOpts.BuiltinHeadersInSystemModules || ActiveModule->isSubModule() || !isBuiltInModuleName(ActiveModule->Name)) + // Don't add headers to the builtin modules if the builtin headers belong to + // the system modules, with the exception of __stddef_max_align_t.h which + // always had its own module. + if (!Map.LangOpts.BuiltinHeadersInSystemModules || + !isBuiltInModuleName(ActiveModule->getTopLevelModuleName()) || + ActiveModule->fullModuleNameIs({"_Builtin_stddef", "max_align_t"})) Map.addUnresolvedHeader(ActiveModule, std::move(Header), NeedsFramework); if (NeedsFramework) diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index 9f82a6d073e3ba..a980f4bcbae124 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -3288,7 +3288,7 @@ void Preprocessor::HandleIfdefDirective(Token &Result, return; } - emitMacroExpansionWarnings(MacroNameTok); + emitMacroExpansionWarnings(MacroNameTok, /*IsIfnDef=*/true); // Check to see if this is the last token on the #if[n]def line. CheckEndOfDirective(isIfndef ? "ifndef" : "ifdef"); diff --git a/clang/lib/Lex/PPExpressions.cpp b/clang/lib/Lex/PPExpressions.cpp index 1feb0eb18d71e6..8f25c67ec9dfbe 100644 --- a/clang/lib/Lex/PPExpressions.cpp +++ b/clang/lib/Lex/PPExpressions.cpp @@ -133,7 +133,9 @@ static bool EvaluateDefined(PPValue &Result, Token &PeekTok, DefinedTracker &DT, Result.Val.setIsUnsigned(false); // Result is signed intmax_t. 
DT.IncludedUndefinedIds = !Macro; - PP.emitMacroExpansionWarnings(PeekTok); + PP.emitMacroExpansionWarnings( + PeekTok, + (II->getName() == "INFINITY" || II->getName() == "NAN") ? true : false); // If there is a macro, mark it used. if (Result.Val != 0 && ValueLive) diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 7833d5a2ea20ee..09b7e1c62fbd7b 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -7545,47 +7545,43 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto, } } - // If the callee uses AArch64 SME ZA state but the caller doesn't define - // any, then this is an error. - FunctionType::ArmStateValue ArmZAState = + FunctionType::ArmStateValue CalleeArmZAState = FunctionType::getArmZAState(ExtInfo.AArch64SMEAttributes); - if (ArmZAState != FunctionType::ARM_None) { + FunctionType::ArmStateValue CalleeArmZT0State = + FunctionType::getArmZT0State(ExtInfo.AArch64SMEAttributes); + if (CalleeArmZAState != FunctionType::ARM_None || + CalleeArmZT0State != FunctionType::ARM_None) { bool CallerHasZAState = false; + bool CallerHasZT0State = false; if (const auto *CallerFD = dyn_cast(CurContext)) { auto *Attr = CallerFD->getAttr(); if (Attr && Attr->isNewZA()) CallerHasZAState = true; - else if (const auto *FPT = - CallerFD->getType()->getAs()) - CallerHasZAState = FunctionType::getArmZAState( - FPT->getExtProtoInfo().AArch64SMEAttributes) != - FunctionType::ARM_None; - } - - if (!CallerHasZAState) - Diag(Loc, diag::err_sme_za_call_no_za_state); - } - - // If the callee uses AArch64 SME ZT0 state but the caller doesn't define - // any, then this is an error. 
- FunctionType::ArmStateValue ArmZT0State = - FunctionType::getArmZT0State(ExtInfo.AArch64SMEAttributes); - if (ArmZT0State != FunctionType::ARM_None) { - bool CallerHasZT0State = false; - if (const auto *CallerFD = dyn_cast(CurContext)) { - auto *Attr = CallerFD->getAttr(); if (Attr && Attr->isNewZT0()) CallerHasZT0State = true; - else if (const auto *FPT = - CallerFD->getType()->getAs()) - CallerHasZT0State = + if (const auto *FPT = CallerFD->getType()->getAs()) { + CallerHasZAState |= + FunctionType::getArmZAState( + FPT->getExtProtoInfo().AArch64SMEAttributes) != + FunctionType::ARM_None; + CallerHasZT0State |= FunctionType::getArmZT0State( FPT->getExtProtoInfo().AArch64SMEAttributes) != FunctionType::ARM_None; + } } - if (!CallerHasZT0State) + if (CalleeArmZAState != FunctionType::ARM_None && !CallerHasZAState) + Diag(Loc, diag::err_sme_za_call_no_za_state); + + if (CalleeArmZT0State != FunctionType::ARM_None && !CallerHasZT0State) Diag(Loc, diag::err_sme_zt0_call_no_zt0_state); + + if (CallerHasZAState && CalleeArmZAState == FunctionType::ARM_None && + CalleeArmZT0State != FunctionType::ARM_None) { + Diag(Loc, diag::err_sme_unimplemented_za_save_restore); + Diag(Loc, diag::note_sme_use_preserves_za); + } } } @@ -7643,9 +7639,8 @@ bool Sema::CheckFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall, unsigned NumArgs = TheCall->getNumArgs(); Expr *ImplicitThis = nullptr; - if (IsMemberOperatorCall && !FDecl->isStatic() && - !FDecl->hasCXXExplicitFunctionObjectParameter()) { - // If this is a call to a non-static member operator, hide the first + if (IsMemberOperatorCall && !FDecl->hasCXXExplicitFunctionObjectParameter()) { + // If this is a call to a member operator, hide the first // argument from checkCall. // FIXME: Our choice of AST representation here is less than ideal. 
ImplicitThis = Args[0]; diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index acfc00f4125407..88fc846c89e42c 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -612,8 +612,12 @@ bool Sema::SetupConstraintScope( // If this is a member function, make sure we get the parameters that // reference the original primary template. - if (const auto *FromMemTempl = - PrimaryTemplate->getInstantiatedFromMemberTemplate()) { + // We walk up the instantiated template chain so that nested lambdas get + // handled properly. + for (FunctionTemplateDecl *FromMemTempl = + PrimaryTemplate->getInstantiatedFromMemberTemplate(); + FromMemTempl; + FromMemTempl = FromMemTempl->getInstantiatedFromMemberTemplate()) { if (addInstantiatedParametersToScope(FD, FromMemTempl->getTemplatedDecl(), Scope, MLTAL)) return true; diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index f9bf1d14bdc4f6..f5bb3e0b42e26c 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -8396,28 +8396,40 @@ void Sema::CheckShadow(NamedDecl *D, NamedDecl *ShadowedDecl, unsigned WarningDiag = diag::warn_decl_shadow; SourceLocation CaptureLoc; - if (isa(D) && isa(ShadowedDecl) && NewDC && - isa(NewDC)) { + if (isa(D) && NewDC && isa(NewDC)) { if (const auto *RD = dyn_cast(NewDC->getParent())) { if (RD->isLambda() && OldDC->Encloses(NewDC->getLexicalParent())) { - if (RD->getLambdaCaptureDefault() == LCD_None) { - // Try to avoid warnings for lambdas with an explicit capture list. + if (const auto *VD = dyn_cast(ShadowedDecl)) { const auto *LSI = cast(getCurFunction()); - // Warn only when the lambda captures the shadowed decl explicitly. 
- CaptureLoc = getCaptureLocation(LSI, cast(ShadowedDecl)); - if (CaptureLoc.isInvalid()) - WarningDiag = diag::warn_decl_shadow_uncaptured_local; - } else { - // Remember that this was shadowed so we can avoid the warning if the - // shadowed decl isn't captured and the warning settings allow it. + if (RD->getLambdaCaptureDefault() == LCD_None) { + // Try to avoid warnings for lambdas with an explicit capture + // list. Warn only when the lambda captures the shadowed decl + // explicitly. + CaptureLoc = getCaptureLocation(LSI, VD); + if (CaptureLoc.isInvalid()) + WarningDiag = diag::warn_decl_shadow_uncaptured_local; + } else { + // Remember that this was shadowed so we can avoid the warning if + // the shadowed decl isn't captured and the warning settings allow + // it. + cast(getCurFunction()) + ->ShadowingDecls.push_back({D, VD}); + return; + } + } + if (isa(ShadowedDecl)) { + // If lambda can capture this, then emit default shadowing warning, + // Otherwise it is not really a shadowing case since field is not + // available in lambda's body. + // At this point we don't know that lambda can capture this, so + // remember that this was shadowed and delay until we know. cast(getCurFunction()) - ->ShadowingDecls.push_back( - {cast(D), cast(ShadowedDecl)}); + ->ShadowingDecls.push_back({D, ShadowedDecl}); return; } } - - if (cast(ShadowedDecl)->hasLocalStorage()) { + if (const auto *VD = dyn_cast(ShadowedDecl); + VD && VD->hasLocalStorage()) { // A variable can't shadow a local variable in an enclosing scope, if // they are separated by a non-capturing declaration context. for (DeclContext *ParentDC = NewDC; @@ -8468,19 +8480,28 @@ void Sema::CheckShadow(NamedDecl *D, NamedDecl *ShadowedDecl, /// when these variables are captured by the lambda. 
void Sema::DiagnoseShadowingLambdaDecls(const LambdaScopeInfo *LSI) { for (const auto &Shadow : LSI->ShadowingDecls) { - const VarDecl *ShadowedDecl = Shadow.ShadowedDecl; + const NamedDecl *ShadowedDecl = Shadow.ShadowedDecl; // Try to avoid the warning when the shadowed decl isn't captured. - SourceLocation CaptureLoc = getCaptureLocation(LSI, ShadowedDecl); const DeclContext *OldDC = ShadowedDecl->getDeclContext(); - Diag(Shadow.VD->getLocation(), CaptureLoc.isInvalid() - ? diag::warn_decl_shadow_uncaptured_local - : diag::warn_decl_shadow) - << Shadow.VD->getDeclName() - << computeShadowedDeclKind(ShadowedDecl, OldDC) << OldDC; - if (!CaptureLoc.isInvalid()) - Diag(CaptureLoc, diag::note_var_explicitly_captured_here) - << Shadow.VD->getDeclName() << /*explicitly*/ 0; - Diag(ShadowedDecl->getLocation(), diag::note_previous_declaration); + if (const auto *VD = dyn_cast(ShadowedDecl)) { + SourceLocation CaptureLoc = getCaptureLocation(LSI, VD); + Diag(Shadow.VD->getLocation(), + CaptureLoc.isInvalid() ? diag::warn_decl_shadow_uncaptured_local + : diag::warn_decl_shadow) + << Shadow.VD->getDeclName() + << computeShadowedDeclKind(ShadowedDecl, OldDC) << OldDC; + if (CaptureLoc.isValid()) + Diag(CaptureLoc, diag::note_var_explicitly_captured_here) + << Shadow.VD->getDeclName() << /*explicitly*/ 0; + Diag(ShadowedDecl->getLocation(), diag::note_previous_declaration); + } else if (isa(ShadowedDecl)) { + Diag(Shadow.VD->getLocation(), + LSI->isCXXThisCaptured() ? 
diag::warn_decl_shadow + : diag::warn_decl_shadow_uncaptured_local) + << Shadow.VD->getDeclName() + << computeShadowedDeclKind(ShadowedDecl, OldDC) << OldDC; + Diag(ShadowedDecl->getLocation(), diag::note_previous_declaration); + } } } @@ -12752,7 +12773,8 @@ namespace { } if (OpaqueValueExpr *OVE = dyn_cast(E)) { - HandleValue(OVE->getSourceExpr()); + if (Expr *SE = OVE->getSourceExpr()) + HandleValue(SE); return; } diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 6413a48f809ac9..4cce0abc231505 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -11142,7 +11142,8 @@ QualType Sema::CheckVectorOperands(ExprResult &LHS, ExprResult &RHS, if (VecType->getVectorKind() == VectorKind::SveFixedLengthData || VecType->getVectorKind() == VectorKind::SveFixedLengthPredicate) return true; - if (VecType->getVectorKind() == VectorKind::RVVFixedLengthData) { + if (VecType->getVectorKind() == VectorKind::RVVFixedLengthData || + VecType->getVectorKind() == VectorKind::RVVFixedLengthMask) { SVEorRVV = 1; return true; } @@ -11173,7 +11174,8 @@ QualType Sema::CheckVectorOperands(ExprResult &LHS, ExprResult &RHS, SecondVecType->getVectorKind() == VectorKind::SveFixedLengthPredicate) return true; - if (SecondVecType->getVectorKind() == VectorKind::RVVFixedLengthData) { + if (SecondVecType->getVectorKind() == VectorKind::RVVFixedLengthData || + SecondVecType->getVectorKind() == VectorKind::RVVFixedLengthMask) { SVEorRVV = 1; return true; } @@ -14060,7 +14062,7 @@ inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS, Expr::EvalResult EVResult; if (RHS.get()->EvaluateAsInt(EVResult, Context)) { llvm::APSInt Result = EVResult.Val.getInt(); - if ((getLangOpts().Bool && !RHS.get()->getType()->isBooleanType() && + if ((getLangOpts().CPlusPlus && !RHS.get()->getType()->isBooleanType() && !RHS.get()->getExprLoc().isMacroID()) || (Result != 0 && Result != 1)) { Diag(Loc, diag::warn_logical_instead_of_bitwise) @@ -18292,7 
+18294,6 @@ void Sema::CheckUnusedVolatileAssignment(Expr *E) { } void Sema::MarkExpressionAsImmediateEscalating(Expr *E) { - assert(!FunctionScopes.empty() && "Expected a function scope"); assert(getLangOpts().CPlusPlus20 && ExprEvalContexts.back().InImmediateEscalatingFunctionContext && "Cannot mark an immediate escalating expression outside of an " @@ -18309,7 +18310,8 @@ void Sema::MarkExpressionAsImmediateEscalating(Expr *E) { } else { assert(false && "expected an immediately escalating expression"); } - getCurFunction()->FoundImmediateEscalatingExpression = true; + if (FunctionScopeInfo *FI = getCurFunction()) + FI->FoundImmediateEscalatingExpression = true; } ExprResult Sema::CheckForImmediateInvocation(ExprResult E, FunctionDecl *Decl) { diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 91e4cb7b68a24a..457fa377355a97 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -4200,7 +4200,7 @@ static OverloadingResult ResolveConstructorOverload( /// \param IsListInit Is this list-initialization? /// \param IsInitListCopy Is this non-list-initialization resulting from a /// list-initialization from {x} where x is the same -/// aggregate type as the entity? +/// type as the entity? static void TryConstructorInitialization(Sema &S, const InitializedEntity &Entity, const InitializationKind &Kind, @@ -4230,14 +4230,6 @@ static void TryConstructorInitialization(Sema &S, Entity.getKind() != InitializedEntity::EK_LambdaToBlockConversionBlockElement); - bool CopyElisionPossible = false; - auto ElideConstructor = [&] { - // Convert qualifications if necessary. 
- Sequence.AddQualificationConversionStep(DestType, VK_PRValue); - if (ILE) - Sequence.RewrapReferenceInitList(DestType, ILE); - }; - // C++17 [dcl.init]p17: // - If the initializer expression is a prvalue and the cv-unqualified // version of the source type is the same class as the class of the @@ -4250,17 +4242,11 @@ static void TryConstructorInitialization(Sema &S, if (S.getLangOpts().CPlusPlus17 && !RequireActualConstructor && UnwrappedArgs.size() == 1 && UnwrappedArgs[0]->isPRValue() && S.Context.hasSameUnqualifiedType(UnwrappedArgs[0]->getType(), DestType)) { - if (ILE && !DestType->isAggregateType()) { - // CWG2311: T{ prvalue_of_type_T } is not eligible for copy elision - // Make this an elision if this won't call an initializer-list - // constructor. (Always on an aggregate type or check constructors first.) - assert(!IsInitListCopy && - "IsInitListCopy only possible with aggregate types"); - CopyElisionPossible = true; - } else { - ElideConstructor(); - return; - } + // Convert qualifications if necessary. 
+ Sequence.AddQualificationConversionStep(DestType, VK_PRValue); + if (ILE) + Sequence.RewrapReferenceInitList(DestType, ILE); + return; } const RecordType *DestRecordType = DestType->getAs(); @@ -4305,12 +4291,6 @@ static void TryConstructorInitialization(Sema &S, S, Kind.getLocation(), Args, CandidateSet, DestType, Ctors, Best, CopyInitialization, AllowExplicit, /*OnlyListConstructors=*/true, IsListInit, RequireActualConstructor); - - if (CopyElisionPossible && Result == OR_No_Viable_Function) { - // No initializer list candidate - ElideConstructor(); - return; - } } // C++11 [over.match.list]p1: @@ -4592,9 +4572,9 @@ static void TryListInitialization(Sema &S, return; } - // C++11 [dcl.init.list]p3, per DR1467 and DR2137: - // - If T is an aggregate class and the initializer list has a single element - // of type cv U, where U is T or a class derived from T, the object is + // C++11 [dcl.init.list]p3, per DR1467: + // - If T is a class type and the initializer list has a single element of + // type cv U, where U is T or a class derived from T, the object is // initialized from that element (by copy-initialization for // copy-list-initialization, or by direct-initialization for // direct-list-initialization). @@ -4605,7 +4585,7 @@ static void TryListInitialization(Sema &S, // - Otherwise, if T is an aggregate, [...] (continue below). 
if (S.getLangOpts().CPlusPlus11 && InitList->getNumInits() == 1 && !IsDesignatedInit) { - if (DestType->isRecordType() && DestType->isAggregateType()) { + if (DestType->isRecordType()) { QualType InitType = InitList->getInit(0)->getType(); if (S.Context.hasSameUnqualifiedType(InitType, DestType) || S.IsDerivedFrom(InitList->getBeginLoc(), InitType, DestType)) { diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 030878899b8122..b708272ebe7d87 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1568,37 +1568,19 @@ TryUserDefinedConversion(Sema &S, Expr *From, QualType ToType, // called for those cases. if (CXXConstructorDecl *Constructor = dyn_cast(ICS.UserDefined.ConversionFunction)) { - QualType FromType; - SourceLocation FromLoc; - // C++11 [over.ics.list]p6, per DR2137: - // C++17 [over.ics.list]p6: - // If C is not an initializer-list constructor and the initializer list - // has a single element of type cv U, where U is X or a class derived - // from X, the implicit conversion sequence has Exact Match rank if U is - // X, or Conversion rank if U is derived from X. 
- if (const auto *InitList = dyn_cast(From); - InitList && InitList->getNumInits() == 1 && - !S.isInitListConstructor(Constructor)) { - const Expr *SingleInit = InitList->getInit(0); - FromType = SingleInit->getType(); - FromLoc = SingleInit->getBeginLoc(); - } else { - FromType = From->getType(); - FromLoc = From->getBeginLoc(); - } - QualType FromCanon = - S.Context.getCanonicalType(FromType.getUnqualifiedType()); + QualType FromCanon + = S.Context.getCanonicalType(From->getType().getUnqualifiedType()); QualType ToCanon = S.Context.getCanonicalType(ToType).getUnqualifiedType(); if (Constructor->isCopyConstructor() && (FromCanon == ToCanon || - S.IsDerivedFrom(FromLoc, FromCanon, ToCanon))) { + S.IsDerivedFrom(From->getBeginLoc(), FromCanon, ToCanon))) { // Turn this into a "standard" conversion sequence, so that it // gets ranked with standard conversion sequences. DeclAccessPair Found = ICS.UserDefined.FoundConversionFunction; ICS.setStandard(); ICS.Standard.setAsIdentityConversion(); - ICS.Standard.setFromType(FromType); + ICS.Standard.setFromType(From->getType()); ICS.Standard.setAllToTypes(ToType); ICS.Standard.CopyConstructor = Constructor; ICS.Standard.FoundCopyConstructor = Found; @@ -5324,18 +5306,18 @@ TryListConversion(Sema &S, InitListExpr *From, QualType ToType, IsDesignatedInit) return Result; - // Per DR1467 and DR2137: - // If the parameter type is an aggregate class X and the initializer list - // has a single element of type cv U, where U is X or a class derived from - // X, the implicit conversion sequence is the one required to convert the - // element to the parameter type. + // Per DR1467: + // If the parameter type is a class X and the initializer list has a single + // element of type cv U, where U is X or a class derived from X, the + // implicit conversion sequence is the one required to convert the element + // to the parameter type. // // Otherwise, if the parameter type is a character array [... 
] // and the initializer list has a single element that is an // appropriately-typed string literal (8.5.2 [dcl.init.string]), the // implicit conversion sequence is the identity conversion. if (From->getNumInits() == 1 && !IsDesignatedInit) { - if (ToType->isRecordType() && ToType->isAggregateType()) { + if (ToType->isRecordType()) { QualType InitType = From->getInit(0)->getType(); if (S.Context.hasSameUnqualifiedType(InitType, ToType) || S.IsDerivedFrom(From->getBeginLoc(), InitType, ToType)) @@ -5682,10 +5664,15 @@ static ImplicitConversionSequence TryObjectArgumentInitialization( assert(FromType->isRecordType()); QualType ClassType = S.Context.getTypeDeclType(ActingContext); - // [class.dtor]p2: A destructor can be invoked for a const, volatile or - // const volatile object. + // C++98 [class.dtor]p2: + // A destructor can be invoked for a const, volatile or const volatile + // object. + // C++98 [over.match.funcs]p4: + // For static member functions, the implicit object parameter is considered + // to match any object (since if the function is selected, the object is + // discarded). Qualifiers Quals = Method->getMethodQualifiers(); - if (isa(Method)) { + if (isa(Method) || Method->isStatic()) { Quals.addConst(); Quals.addVolatile(); } @@ -14483,6 +14470,23 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc, CurFPFeatureOverrides()); } + // If this is the .* operator, which is not overloadable, just + // create a built-in binary operator. + if (Opc == BO_PtrMemD) { + auto CheckPlaceholder = [&](Expr *&Arg) { + ExprResult Res = CheckPlaceholderExpr(Arg); + if (Res.isUsable()) + Arg = Res.get(); + return !Res.isUsable(); + }; + + // CreateBuiltinBinOp() doesn't like it if we tell it to create a '.*' + // expression that contains placeholders (in either the LHS or RHS). 
+ if (CheckPlaceholder(Args[0]) || CheckPlaceholder(Args[1])) + return ExprError(); + return CreateBuiltinBinOp(OpLoc, Opc, Args[0], Args[1]); + } + // Always do placeholder-like conversions on the RHS. if (checkPlaceholderForOverload(*this, Args[1])) return ExprError(); @@ -14502,11 +14506,6 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc, if (Opc == BO_Assign && !Args[0]->getType()->isOverloadableType()) return CreateBuiltinBinOp(OpLoc, Opc, Args[0], Args[1]); - // If this is the .* operator, which is not overloadable, just - // create a built-in binary operator. - if (Opc == BO_PtrMemD) - return CreateBuiltinBinOp(OpLoc, Opc, Args[0], Args[1]); - // Build the overload set. OverloadCandidateSet CandidateSet(OpLoc, OverloadCandidateSet::CSK_Operator, OverloadCandidateSet::OperatorRewriteInfo( @@ -15079,7 +15078,7 @@ ExprResult Sema::CreateOverloadedArraySubscriptExpr(SourceLocation LLoc, CXXMethodDecl *Method = cast(FnDecl); SmallVector MethodArgs; - // Handle 'this' parameter if the selected function is not static. + // Initialize the object parameter. 
if (Method->isExplicitObjectMemberFunction()) { ExprResult Res = InitializeExplicitObjectArgument(*this, Args[0], Method); @@ -15087,7 +15086,7 @@ ExprResult Sema::CreateOverloadedArraySubscriptExpr(SourceLocation LLoc, return ExprError(); Args[0] = Res.get(); ArgExpr = Args; - } else if (Method->isInstance()) { + } else { ExprResult Arg0 = PerformImplicitObjectArgumentInitialization( Args[0], /*Qualifier=*/nullptr, Best->FoundDecl, Method); if (Arg0.isInvalid()) @@ -15115,15 +15114,9 @@ ExprResult Sema::CreateOverloadedArraySubscriptExpr(SourceLocation LLoc, ExprValueKind VK = Expr::getValueKindForType(ResultTy); ResultTy = ResultTy.getNonLValueExprType(Context); - CallExpr *TheCall; - if (Method->isInstance()) - TheCall = CXXOperatorCallExpr::Create( - Context, OO_Subscript, FnExpr.get(), MethodArgs, ResultTy, VK, - RLoc, CurFPFeatureOverrides()); - else - TheCall = - CallExpr::Create(Context, FnExpr.get(), MethodArgs, ResultTy, VK, - RLoc, CurFPFeatureOverrides()); + CallExpr *TheCall = CXXOperatorCallExpr::Create( + Context, OO_Subscript, FnExpr.get(), MethodArgs, ResultTy, VK, RLoc, + CurFPFeatureOverrides()); if (CheckCallReturnType(FnDecl->getReturnType(), LLoc, TheCall, FnDecl)) return ExprError(); @@ -15751,15 +15744,13 @@ Sema::BuildCallToObjectOfClassType(Scope *S, Expr *Obj, bool IsError = false; - // Initialize the implicit object parameter if needed. - // Since C++23, this could also be a call to a static call operator - // which we emit as a regular CallExpr. + // Initialize the object parameter. llvm::SmallVector NewArgs; if (Method->isExplicitObjectMemberFunction()) { // FIXME: we should do that during the definition of the lambda when we can. 
DiagnoseInvalidExplicitObjectParameterInLambda(Method); PrepareExplicitObjectArgument(*this, Method, Obj, Args, NewArgs); - } else if (Method->isInstance()) { + } else { ExprResult ObjRes = PerformImplicitObjectArgumentInitialization( Object.get(), /*Qualifier=*/nullptr, Best->FoundDecl, Method); if (ObjRes.isInvalid()) @@ -15793,14 +15784,9 @@ Sema::BuildCallToObjectOfClassType(Scope *S, Expr *Obj, ExprValueKind VK = Expr::getValueKindForType(ResultTy); ResultTy = ResultTy.getNonLValueExprType(Context); - CallExpr *TheCall; - if (Method->isInstance()) - TheCall = CXXOperatorCallExpr::Create(Context, OO_Call, NewFn.get(), - MethodArgs, ResultTy, VK, RParenLoc, - CurFPFeatureOverrides()); - else - TheCall = CallExpr::Create(Context, NewFn.get(), MethodArgs, ResultTy, VK, - RParenLoc, CurFPFeatureOverrides()); + CallExpr *TheCall = CXXOperatorCallExpr::Create( + Context, OO_Call, NewFn.get(), MethodArgs, ResultTy, VK, RParenLoc, + CurFPFeatureOverrides()); if (CheckCallReturnType(Method->getReturnType(), LParenLoc, TheCall, Method)) return true; diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 9bfa71dc8bcf1d..b619f5d729e86b 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -1830,7 +1830,27 @@ static TemplateParameterList *GetTemplateParameterList(TemplateDecl *TD) { // Make sure we get the template parameter list from the most // recent declaration, since that is the only one that is guaranteed to // have all the default template argument information. - return cast(TD->getMostRecentDecl())->getTemplateParameters(); + Decl *D = TD->getMostRecentDecl(); + // C++11 [temp.param]p12: + // A default template argument shall not be specified in a friend class + // template declaration. + // + // Skip past friend *declarations* because they are not supposed to contain + // default template arguments. 
Moreover, these declarations may introduce + // template parameters living in different template depths than the + // corresponding template parameters in TD, causing unmatched constraint + // substitution. + // + // FIXME: Diagnose such cases within a class template: + // template + // struct S { + // template friend struct C; + // }; + // template struct S; + while (D->getFriendObjectKind() != Decl::FriendObjectKind::FOK_None && + D->getPreviousDecl()) + D = D->getPreviousDecl(); + return cast(D)->getTemplateParameters(); } DeclResult Sema::CheckClassTemplate( @@ -7412,9 +7432,9 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, if (ArgResult.isInvalid()) return ExprError(); - // Prior to C++20, enforce restrictions on possible template argument - // values. - if (!getLangOpts().CPlusPlus20 && Value.isLValue()) { + if (Value.isLValue()) { + APValue::LValueBase Base = Value.getLValueBase(); + auto *VD = const_cast(Base.dyn_cast()); // For a non-type template-parameter of pointer or reference type, // the value of the constant expression shall not refer to assert(ParamType->isPointerType() || ParamType->isReferenceType() || @@ -7423,8 +7443,6 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, // -- a string literal // -- the result of a typeid expression, or // -- a predefined __func__ variable - APValue::LValueBase Base = Value.getLValueBase(); - auto *VD = const_cast(Base.dyn_cast()); if (Base && (!VD || isa(VD))) { @@ -7432,24 +7450,30 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, << Arg->getSourceRange(); return ExprError(); } - // -- a subobject [until C++20] - if (Value.hasLValuePath() && Value.getLValuePath().size() == 1 && - VD && VD->getType()->isArrayType() && + + if (Value.hasLValuePath() && Value.getLValuePath().size() == 1 && VD && + VD->getType()->isArrayType() && Value.getLValuePath()[0].getAsArrayIndex() == 0 && !Value.isLValueOnePastTheEnd() && ParamType->isPointerType()) { - 
// Per defect report (no number yet): - // ... other than a pointer to the first element of a complete array - // object. - } else if (!Value.hasLValuePath() || Value.getLValuePath().size() || - Value.isLValueOnePastTheEnd()) { - Diag(StartLoc, diag::err_non_type_template_arg_subobject) - << Value.getAsString(Context, ParamType); - return ExprError(); + SugaredConverted = TemplateArgument(VD, ParamType); + CanonicalConverted = TemplateArgument( + cast(VD->getCanonicalDecl()), CanonParamType); + return ArgResult.get(); + } + + // -- a subobject [until C++20] + if (!getLangOpts().CPlusPlus20) { + if (!Value.hasLValuePath() || Value.getLValuePath().size() || + Value.isLValueOnePastTheEnd()) { + Diag(StartLoc, diag::err_non_type_template_arg_subobject) + << Value.getAsString(Context, ParamType); + return ExprError(); + } + assert((VD || !ParamType->isReferenceType()) && + "null reference should not be a constant expression"); + assert((!VD || !ParamType->isNullPtrType()) && + "non-null value of type nullptr_t?"); } - assert((VD || !ParamType->isReferenceType()) && - "null reference should not be a constant expression"); - assert((!VD || !ParamType->isNullPtrType()) && - "non-null value of type nullptr_t?"); } if (Value.isAddrLabelDiff()) diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 9cb6c0a4ef248e..92086d7277fd1f 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -8646,21 +8646,30 @@ static void HandleRISCVRVVVectorBitsTypeAttr(QualType &CurType, ASTContext::BuiltinVectorTypeInfo Info = S.Context.getBuiltinVectorTypeInfo(CurType->castAs()); - unsigned EltSize = S.Context.getTypeSize(Info.ElementType); unsigned MinElts = Info.EC.getKnownMinValue(); + VectorKind VecKind = VectorKind::RVVFixedLengthData; + unsigned ExpectedSize = VScale->first * MinElts; + QualType EltType = CurType->getRVVEltType(S.Context); + unsigned EltSize = S.Context.getTypeSize(EltType); + unsigned NumElts; + if (Info.ElementType == 
S.Context.BoolTy) { + NumElts = VecSize / S.Context.getCharWidth(); + VecKind = VectorKind::RVVFixedLengthMask; + } else { + ExpectedSize *= EltSize; + NumElts = VecSize / EltSize; + } + // The attribute vector size must match -mrvv-vector-bits. - unsigned ExpectedSize = VScale->first * MinElts * EltSize; - if (VecSize != ExpectedSize) { + if (ExpectedSize % 8 != 0 || VecSize != ExpectedSize) { S.Diag(Attr.getLoc(), diag::err_attribute_bad_rvv_vector_size) << VecSize << ExpectedSize; Attr.setInvalid(); return; } - VectorKind VecKind = VectorKind::RVVFixedLengthData; - VecSize /= EltSize; - CurType = S.Context.getVectorType(Info.ElementType, VecSize, VecKind); + CurType = S.Context.getVectorType(EltType, NumElts, VecKind); } /// Handle OpenCL Access Qualifier Attribute. diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index fecd94e875f671..490b8cb10a4841 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -9743,6 +9743,9 @@ void ASTReader::finishPendingActions() { if (!FD->isLateTemplateParsed() && !NonConstDefn->isLateTemplateParsed() && + // We only perform ODR checks for decls not in the explicit + // global module fragment. 
+ !FD->shouldSkipCheckingODR() && FD->getODRHash() != NonConstDefn->getODRHash()) { if (!isa(FD)) { PendingFunctionOdrMergeFailures[FD].push_back(NonConstDefn); diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index a149d82153037f..110f55f8c0f49a 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -800,12 +800,15 @@ void ASTDeclReader::VisitEnumDecl(EnumDecl *ED) { BitsUnpacker EnumDeclBits(Record.readInt()); ED->setNumPositiveBits(EnumDeclBits.getNextBits(/*Width=*/8)); ED->setNumNegativeBits(EnumDeclBits.getNextBits(/*Width=*/8)); + bool ShouldSkipCheckingODR = EnumDeclBits.getNextBit(); ED->setScoped(EnumDeclBits.getNextBit()); ED->setScopedUsingClassTag(EnumDeclBits.getNextBit()); ED->setFixed(EnumDeclBits.getNextBit()); - ED->setHasODRHash(true); - ED->ODRHash = Record.readInt(); + if (!ShouldSkipCheckingODR) { + ED->setHasODRHash(true); + ED->ODRHash = Record.readInt(); + } // If this is a definition subject to the ODR, and we already have a // definition, merge this one into it. @@ -827,7 +830,10 @@ void ASTDeclReader::VisitEnumDecl(EnumDecl *ED) { Reader.MergedDeclContexts.insert(std::make_pair(ED, OldDef)); ED->demoteThisDefinitionToDeclaration(); Reader.mergeDefinitionVisibility(OldDef, ED); - if (OldDef->getODRHash() != ED->getODRHash()) + // We don't want to check the ODR hash value for declarations from global + // module fragment. + if (!ED->shouldSkipCheckingODR() && + OldDef->getODRHash() != ED->getODRHash()) Reader.PendingEnumOdrMergeFailures[OldDef].push_back(ED); } else { OldDef = ED; @@ -866,6 +872,9 @@ ASTDeclReader::VisitRecordDeclImpl(RecordDecl *RD) { void ASTDeclReader::VisitRecordDecl(RecordDecl *RD) { VisitRecordDeclImpl(RD); + // We should only reach here if we're in C/Objective-C. There is no + // global module fragment. 
+ assert(!RD->shouldSkipCheckingODR()); RD->setODRHash(Record.readInt()); // Maintain the invariant of a redeclaration chain containing only @@ -1065,6 +1074,7 @@ void ASTDeclReader::VisitFunctionDecl(FunctionDecl *FD) { FD->setCachedLinkage((Linkage)FunctionDeclBits.getNextBits(/*Width=*/3)); FD->setStorageClass((StorageClass)FunctionDeclBits.getNextBits(/*Width=*/3)); + bool ShouldSkipCheckingODR = FunctionDeclBits.getNextBit(); FD->setInlineSpecified(FunctionDeclBits.getNextBit()); FD->setImplicitlyInline(FunctionDeclBits.getNextBit()); FD->setHasSkippedBody(FunctionDeclBits.getNextBit()); @@ -1094,8 +1104,10 @@ void ASTDeclReader::VisitFunctionDecl(FunctionDecl *FD) { if (FD->isExplicitlyDefaulted()) FD->setDefaultLoc(readSourceLocation()); - FD->ODRHash = Record.readInt(); - FD->setHasODRHash(true); + if (!ShouldSkipCheckingODR) { + FD->ODRHash = Record.readInt(); + FD->setHasODRHash(true); + } if (FD->isDefaulted()) { if (unsigned NumLookups = Record.readInt()) { @@ -1963,6 +1975,8 @@ void ASTDeclReader::ReadCXXDefinitionData( BitsUnpacker CXXRecordDeclBits = Record.readInt(); + bool ShouldSkipCheckingODR = CXXRecordDeclBits.getNextBit(); + #define FIELD(Name, Width, Merge) \ if (!CXXRecordDeclBits.canGetNextNBits(Width)) \ CXXRecordDeclBits.updateValue(Record.readInt()); \ @@ -1971,9 +1985,12 @@ void ASTDeclReader::ReadCXXDefinitionData( #include "clang/AST/CXXRecordDeclDefinitionBits.def" #undef FIELD - // Note: the caller has deserialized the IsLambda bit already. - Data.ODRHash = Record.readInt(); - Data.HasODRHash = true; + // We only perform ODR checks for decls not in GMF. + if (!ShouldSkipCheckingODR) { + // Note: the caller has deserialized the IsLambda bit already. + Data.ODRHash = Record.readInt(); + Data.HasODRHash = true; + } if (Record.readInt()) { Reader.DefinitionSource[D] = @@ -2134,6 +2151,10 @@ void ASTDeclReader::MergeDefinitionData( } } + // We don't want to check ODR for decls in the global module fragment. 
+ if (MergeDD.Definition->shouldSkipCheckingODR()) + return; + if (D->getODRHash() != MergeDD.ODRHash) { DetectedOdrViolation = true; } @@ -3498,11 +3519,14 @@ ASTDeclReader::FindExistingResult ASTDeclReader::findExisting(NamedDecl *D) { // If this declaration is from a merged context, make a note that we need to // check that the canonical definition of that context contains the decl. // + // Note that we don't perform ODR checks for decls from the global module + // fragment. + // // FIXME: We should do something similar if we merge two definitions of the // same template specialization into the same CXXRecordDecl. auto MergedDCIt = Reader.MergedDeclContexts.find(D->getLexicalDeclContext()); if (MergedDCIt != Reader.MergedDeclContexts.end() && - MergedDCIt->second == D->getDeclContext()) + !D->shouldSkipCheckingODR() && MergedDCIt->second == D->getDeclContext()) Reader.PendingOdrMergeChecks.push_back(D); return FindExistingResult(Reader, D, /*Existing=*/nullptr, diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 03bddfe0f5047d..378a1f86bd5342 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -6010,6 +6010,9 @@ void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) { BitsPacker DefinitionBits; + bool ShouldSkipCheckingODR = D->shouldSkipCheckingODR(); + DefinitionBits.addBit(ShouldSkipCheckingODR); + #define FIELD(Name, Width, Merge) \ if (!DefinitionBits.canWriteNextNBits(Width)) { \ Record->push_back(DefinitionBits); \ @@ -6022,8 +6025,11 @@ void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) { Record->push_back(DefinitionBits); - // getODRHash will compute the ODRHash if it has not been previously computed. - Record->push_back(D->getODRHash()); + // We only perform ODR checks for decls not in GMF. + if (!ShouldSkipCheckingODR) + // getODRHash will compute the ODRHash if it has not been previously + // computed. 
+ Record->push_back(D->getODRHash()); bool ModulesDebugInfo = Writer->Context->getLangOpts().ModulesDebugInfo && !D->isDependentType(); diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index bb1f51786d2813..42583c09f009e0 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -488,12 +488,16 @@ void ASTDeclWriter::VisitEnumDecl(EnumDecl *D) { BitsPacker EnumDeclBits; EnumDeclBits.addBits(D->getNumPositiveBits(), /*BitWidth=*/8); EnumDeclBits.addBits(D->getNumNegativeBits(), /*BitWidth=*/8); + bool ShouldSkipCheckingODR = D->shouldSkipCheckingODR(); + EnumDeclBits.addBit(ShouldSkipCheckingODR); EnumDeclBits.addBit(D->isScoped()); EnumDeclBits.addBit(D->isScopedUsingClassTag()); EnumDeclBits.addBit(D->isFixed()); Record.push_back(EnumDeclBits); - Record.push_back(D->getODRHash()); + // We only perform ODR checks for decls not in GMF. + if (!ShouldSkipCheckingODR) + Record.push_back(D->getODRHash()); if (MemberSpecializationInfo *MemberInfo = D->getMemberSpecializationInfo()) { Record.AddDeclRef(MemberInfo->getInstantiatedFrom()); @@ -510,7 +514,7 @@ void ASTDeclWriter::VisitEnumDecl(EnumDecl *D) { !D->isTopLevelDeclInObjCContainer() && !CXXRecordDecl::classofKind(D->getKind()) && !D->getIntegerTypeSourceInfo() && !D->getMemberSpecializationInfo() && - !needsAnonymousDeclarationNumber(D) && + !needsAnonymousDeclarationNumber(D) && !D->shouldSkipCheckingODR() && D->getDeclName().getNameKind() == DeclarationName::Identifier) AbbrevToUse = Writer.getDeclEnumAbbrev(); @@ -676,6 +680,8 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { // FIXME: stable encoding FunctionDeclBits.addBits(llvm::to_underlying(D->getLinkageInternal()), 3); FunctionDeclBits.addBits((uint32_t)D->getStorageClass(), /*BitWidth=*/3); + bool ShouldSkipCheckingODR = D->shouldSkipCheckingODR(); + FunctionDeclBits.addBit(ShouldSkipCheckingODR); FunctionDeclBits.addBit(D->isInlineSpecified()); 
FunctionDeclBits.addBit(D->isInlined()); FunctionDeclBits.addBit(D->hasSkippedBody()); @@ -701,7 +707,9 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { if (D->isExplicitlyDefaulted()) Record.AddSourceLocation(D->getDefaultLoc()); - Record.push_back(D->getODRHash()); + // We only perform ODR checks for decls not in GMF. + if (!ShouldSkipCheckingODR) + Record.push_back(D->getODRHash()); if (D->isDefaulted()) { if (auto *FDI = D->getDefaultedFunctionInfo()) { @@ -1506,7 +1514,8 @@ void ASTDeclWriter::VisitCXXMethodDecl(CXXMethodDecl *D) { D->getFirstDecl() == D->getMostRecentDecl() && !D->isInvalidDecl() && !D->hasAttrs() && !D->isTopLevelDeclInObjCContainer() && D->getDeclName().getNameKind() == DeclarationName::Identifier && - !D->hasExtInfo() && !D->isExplicitlyDefaulted()) { + !D->shouldSkipCheckingODR() && !D->hasExtInfo() && + !D->isExplicitlyDefaulted()) { if (D->getTemplatedKind() == FunctionDecl::TK_NonTemplate || D->getTemplatedKind() == FunctionDecl::TK_FunctionTemplate || D->getTemplatedKind() == FunctionDecl::TK_MemberSpecialization || @@ -2132,12 +2141,13 @@ getFunctionDeclAbbrev(serialization::DeclCode Code) { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 11)); // IDNS Abv->Add(BitCodeAbbrevOp( BitCodeAbbrevOp::Fixed, - 27)); // Packed Function Bits: StorageClass, Inline, InlineSpecified, + 28)); // Packed Function Bits: StorageClass, Inline, InlineSpecified, // VirtualAsWritten, Pure, HasInheritedProto, HasWrittenProto, // Deleted, Trivial, TrivialForCall, Defaulted, ExplicitlyDefaulted, // IsIneligibleOrNotSelected, ImplicitReturnZero, Constexpr, // UsesSEHTry, SkippedBody, MultiVersion, LateParsed, - // FriendConstraintRefersToEnclosingTemplate, Linkage + // FriendConstraintRefersToEnclosingTemplate, Linkage, + // ShouldSkipCheckingODR Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LocEnd Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // ODRHash // This Array slurps the rest of the record. 
Fortunately we want to encode @@ -2264,7 +2274,7 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // AddTypeRef Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // IntegerType Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // getPromotionType - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 19)); // Enum Decl Bits + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 20)); // Enum Decl Bits Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));// ODRHash Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // InstantiatedMembEnum // DC diff --git a/clang/lib/StaticAnalyzer/Checkers/Taint.cpp b/clang/lib/StaticAnalyzer/Checkers/Taint.cpp index 4edb671753bf45..6362c82b009d72 100644 --- a/clang/lib/StaticAnalyzer/Checkers/Taint.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/Taint.cpp @@ -216,21 +216,17 @@ std::vector taint::getTaintedSymbolsImpl(ProgramStateRef State, std::vector TaintedSymbols; if (!Reg) return TaintedSymbols; - // Element region (array element) is tainted if either the base or the offset - // are tainted. + + // Element region (array element) is tainted if the offset is tainted. if (const ElementRegion *ER = dyn_cast(Reg)) { std::vector TaintedIndex = getTaintedSymbolsImpl(State, ER->getIndex(), K, returnFirstOnly); llvm::append_range(TaintedSymbols, TaintedIndex); if (returnFirstOnly && !TaintedSymbols.empty()) return TaintedSymbols; // return early if needed - std::vector TaintedSuperRegion = - getTaintedSymbolsImpl(State, ER->getSuperRegion(), K, returnFirstOnly); - llvm::append_range(TaintedSymbols, TaintedSuperRegion); - if (returnFirstOnly && !TaintedSymbols.empty()) - return TaintedSymbols; // return early if needed } + // Symbolic region is tainted if the corresponding symbol is tainted. 
if (const SymbolicRegion *SR = dyn_cast(Reg)) { std::vector TaintedRegions = getTaintedSymbolsImpl(State, SR->getSymbol(), K, returnFirstOnly); @@ -239,6 +235,8 @@ std::vector taint::getTaintedSymbolsImpl(ProgramStateRef State, return TaintedSymbols; // return early if needed } + // Any subregion (including Element and Symbolic regions) is tainted if its + // super-region is tainted. if (const SubRegion *ER = dyn_cast(Reg)) { std::vector TaintedSubRegions = getTaintedSymbolsImpl(State, ER->getSuperRegion(), K, returnFirstOnly); @@ -318,4 +316,4 @@ std::vector taint::getTaintedSymbolsImpl(ProgramStateRef State, } } return TaintedSymbols; -} \ No newline at end of file +} diff --git a/clang/lib/StaticAnalyzer/Checkers/cert/InvalidPtrChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/cert/InvalidPtrChecker.cpp index e5dd907c660d8e..b2947f590c4ec1 100644 --- a/clang/lib/StaticAnalyzer/Checkers/cert/InvalidPtrChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/cert/InvalidPtrChecker.cpp @@ -205,8 +205,12 @@ void InvalidPtrChecker::postPreviousReturnInvalidatingCall( CE, LCtx, CE->getType(), C.blockCount()); State = State->BindExpr(CE, LCtx, RetVal); + const auto *SymRegOfRetVal = + dyn_cast_or_null(RetVal.getAsRegion()); + if (!SymRegOfRetVal) + return; + // Remember to this region. 
- const auto *SymRegOfRetVal = cast(RetVal.getAsRegion()); const MemRegion *MR = SymRegOfRetVal->getBaseRegion(); State = State->set(FD, MR); diff --git a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp index 0ac1d91b79beb5..bc14aea27f6736 100644 --- a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp +++ b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp @@ -1409,7 +1409,7 @@ CallEventManager::getSimpleCall(const CallExpr *CE, ProgramStateRef State, if (const auto *OpCE = dyn_cast(CE)) { const FunctionDecl *DirectCallee = OpCE->getDirectCallee(); if (const auto *MD = dyn_cast(DirectCallee)) - if (MD->isInstance()) + if (MD->isImplicitObjectMemberFunction()) return create(OpCE, State, LCtx, ElemRef); } else if (CE->getCallee()->getType()->isBlockPointerType()) { diff --git a/clang/lib/StaticAnalyzer/Core/Environment.cpp b/clang/lib/StaticAnalyzer/Core/Environment.cpp index 4f989ed59bee38..427f51109853bd 100644 --- a/clang/lib/StaticAnalyzer/Core/Environment.cpp +++ b/clang/lib/StaticAnalyzer/Core/Environment.cpp @@ -40,8 +40,11 @@ static const Expr *ignoreTransparentExprs(const Expr *E) { switch (E->getStmtClass()) { case Stmt::OpaqueValueExprClass: - E = cast(E)->getSourceExpr(); - break; + if (const Expr *SE = cast(E)->getSourceExpr()) { + E = SE; + break; + } + return E; case Stmt::ExprWithCleanupsClass: E = cast(E)->getSubExpr(); break; @@ -98,7 +101,6 @@ SVal Environment::getSVal(const EnvironmentEntry &Entry, case Stmt::CXXBindTemporaryExprClass: case Stmt::ExprWithCleanupsClass: case Stmt::GenericSelectionExprClass: - case Stmt::OpaqueValueExprClass: case Stmt::ConstantExprClass: case Stmt::ParenExprClass: case Stmt::SubstNonTypeTemplateParmExprClass: diff --git a/clang/test/AST/ast-crash-doc-function-template.cpp b/clang/test/AST/ast-crash-doc-function-template.cpp new file mode 100644 index 00000000000000..d48eb0dbe02f01 --- /dev/null +++ b/clang/test/AST/ast-crash-doc-function-template.cpp @@ -0,0 +1,30 @@ +// RUN: rm -rf %t 
+// RUN: split-file %s %t + +// RUN: %clang_cc1 -x c++ -Wdocumentation -fsyntax-only -ast-dump-all %t/t.cpp + +//--- t.h +/// MyClass in the header file +class MyClass { +public: + template + void Foo() const; + + /// Bar + void Bar() const; +}; + +//--- t.cpp +#include "t.h" + +/// MyClass::Bar: Foo() is implicitly instantiated and called here. +void MyClass::Bar() const { + Foo(); +} + +/// MyClass::Foo +template +void MyClass::Foo() const { +} + +// CHECK: TranslationUnitDecl diff --git a/clang/test/AST/ast-dump-override-final.cpp b/clang/test/AST/ast-dump-override-final.cpp new file mode 100644 index 00000000000000..c1cee6b01565f6 --- /dev/null +++ b/clang/test/AST/ast-dump-override-final.cpp @@ -0,0 +1,20 @@ +// This file contain tests to check if override and final are dumped in the +// correct positions. + +// RUN: %clang_cc1 -ast-print -x c++ %s -o - | FileCheck %s + +// CHECK: class A { +class A { + // CHECK-NEXT: virtual void f(); + virtual void f(); + + // CHECK-NEXT: virtual void g() final; + virtual void g() final; +} AA; + +// CHECK: class B : public A { +class B : public A { + // CHECK-NEXT: virtual void f() override { + virtual void f() override { + }; +} B; diff --git a/clang/test/AST/ast-dump-static-operators.cpp b/clang/test/AST/ast-dump-static-operators.cpp new file mode 100644 index 00000000000000..e8454bdac02f7b --- /dev/null +++ b/clang/test/AST/ast-dump-static-operators.cpp @@ -0,0 +1,55 @@ +// RUN: %clang_cc1 -std=c++23 %s -ast-dump -triple x86_64-unknown-unknown -o - | FileCheck -strict-whitespace %s + +struct Functor { + static int operator()(int x, int y) { + return x + y; + } + static int operator[](int x, int y) { + return x + y; + } +}; + +Functor& get_functor() { + static Functor functor; + return functor; +} + +void call_static_operators() { + Functor functor; + + int z1 = functor(1, 2); + // CHECK: CXXOperatorCallExpr {{.*}} 'int' '()' + // CHECK-NEXT: |-ImplicitCastExpr {{.*}} 'int (*)(int, int)' + // CHECK-NEXT: | `-DeclRefExpr 
{{.*}} 'int (int, int)' lvalue CXXMethod {{.*}} 'operator()' 'int (int, int)' + // CHECK-NEXT: |-DeclRefExpr {{.*}} 'Functor' lvalue Var {{.*}} 'functor' 'Functor' + // CHECK-NEXT: |-IntegerLiteral {{.*}} 'int' 1 + // CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 2 + + int z2 = functor[1, 2]; + // CHECK: CXXOperatorCallExpr {{.*}} 'int' '[]' + // CHECK-NEXT: |-ImplicitCastExpr {{.*}} 'int (*)(int, int)' + // CHECK-NEXT: | `-DeclRefExpr {{.*}} 'int (int, int)' lvalue CXXMethod {{.*}} 'operator[]' 'int (int, int)' + // CHECK-NEXT: |-DeclRefExpr {{.*}} 'Functor' lvalue Var {{.*}} 'functor' 'Functor' + // CHECK-NEXT: |-IntegerLiteral {{.*}} 'int' 1 + // CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 2 + + int z3 = get_functor()(1, 2); + // CHECK: CXXOperatorCallExpr {{.*}} 'int' '()' + // CHECK-NEXT: |-ImplicitCastExpr {{.*}} 'int (*)(int, int)' + // CHECK-NEXT: | `-DeclRefExpr {{.*}} 'int (int, int)' lvalue CXXMethod {{.*}} 'operator()' 'int (int, int)' + // CHECK-NEXT: |-CallExpr {{.*}} 'Functor' lvalue + // CHECK-NEXT: | `-ImplicitCastExpr {{.*}} 'Functor &(*)()' + // CHECK-NEXT: | `-DeclRefExpr {{.*}} 'Functor &()' lvalue Function {{.*}} 'get_functor' 'Functor &()' + // CHECK-NEXT: |-IntegerLiteral {{.*}} 'int' 1 + // CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 2 + + int z4 = get_functor()[1, 2]; + // CHECK: CXXOperatorCallExpr {{.*}} 'int' '[]' + // CHECK-NEXT: |-ImplicitCastExpr {{.*}} 'int (*)(int, int)' + // CHECK-NEXT: | `-DeclRefExpr {{.*}} 'int (int, int)' lvalue CXXMethod {{.*}} 'operator[]' 'int (int, int)' + // CHECK-NEXT: |-CallExpr {{.*}} 'Functor' lvalue + // CHECK-NEXT: | `-ImplicitCastExpr {{.*}} 'Functor &(*)()' + // CHECK-NEXT: | `-DeclRefExpr {{.*}} 'Functor &()' lvalue Function {{.*}} 'get_functor' 'Functor &()' + // CHECK-NEXT: |-IntegerLiteral {{.*}} 'int' 1 + // CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 2 +} diff --git a/clang/test/Analysis/cxx2b-deducing-this.cpp b/clang/test/Analysis/cxx2b-deducing-this.cpp index d22a897097bec0..2ec9e96bf0f84f 100644 
--- a/clang/test/Analysis/cxx2b-deducing-this.cpp +++ b/clang/test/Analysis/cxx2b-deducing-this.cpp @@ -60,3 +60,14 @@ void top() { s.c(); s.c(11); } + + +struct S2 { + bool operator==(this auto, S2) { + return true; + } +}; +void use_deducing_this() { + int result = S2{} == S2{}; // no-crash + clang_analyzer_dump(result); // expected-warning {{1 S32b}} +} diff --git a/clang/test/Analysis/invalid-ptr-checker.cpp b/clang/test/Analysis/invalid-ptr-checker.cpp new file mode 100644 index 00000000000000..58bb45e0fb8421 --- /dev/null +++ b/clang/test/Analysis/invalid-ptr-checker.cpp @@ -0,0 +1,10 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=core,security.cert.env.InvalidPtr -verify %s + +// expected-no-diagnostics + +namespace other { +int strerror(int errnum); // custom strerror +void no_crash_on_custom_strerror() { + (void)strerror(0); // no-crash +} +} // namespace other diff --git a/clang/test/Analysis/templates.cpp b/clang/test/Analysis/templates.cpp index 061c19fe7e0445..6da1821b70f26f 100644 --- a/clang/test/Analysis/templates.cpp +++ b/clang/test/Analysis/templates.cpp @@ -68,3 +68,16 @@ namespace rdar13954714 { // force instantiation template void blockWithStatic(); } + +namespace structural_value_crash { + constexpr char abc[] = "abc"; + + template + void use_template_param() { + const char *p = in; + } + + void force_instantiate() { + use_template_param(); + } +} diff --git a/clang/test/CXX/class.derived/class.member.lookup/p11.cpp b/clang/test/CXX/class.derived/class.member.lookup/p11.cpp index e0899b227e69bd..a42febaca3f041 100644 --- a/clang/test/CXX/class.derived/class.member.lookup/p11.cpp +++ b/clang/test/CXX/class.derived/class.member.lookup/p11.cpp @@ -23,3 +23,25 @@ struct D: I1, I2, B2 { int D::* mpD = &D::i; // expected-error {{non-static member 'i' found in multiple base-class subobjects of type 'B1'}} } }; + +namespace GH80435 { +struct A { + void *data; // expected-note {{member found by ambiguous name lookup}} +}; + +class B { + void *data; 
// expected-note {{member found by ambiguous name lookup}} +}; + +struct C : A, B {}; + +decltype(C().data) x; // expected-error {{member 'data' found in multiple base classes of different types}} + +struct D { // expected-note {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'C' to 'const D' for 1st argument}} + // expected-note@-1{{candidate constructor (the implicit move constructor) not viable: no known conversion from 'C' to 'D' for 1st argument}} + template + D(Container); // expected-note {{candidate template ignored: substitution failure [with Container = C]: member 'data' found in multiple base classes of different types}} +}; + +D y(C{}); // expected-error {{no matching constructor for initialization of 'D'}} +} diff --git a/clang/test/CXX/drs/dr14xx.cpp b/clang/test/CXX/drs/dr14xx.cpp index 4c29d03a6e117a..d262f6f9dcab79 100644 --- a/clang/test/CXX/drs/dr14xx.cpp +++ b/clang/test/CXX/drs/dr14xx.cpp @@ -488,6 +488,16 @@ namespace dr1467 { // dr1467: 3.7 c++11 } } // nonaggregate + namespace SelfInitIsNotListInit { + struct S { + S(); + explicit S(S &); + S(const S &); + }; + S s1; + S s2 = {s1}; // ok, not list-initialization so we pick the non-explicit constructor + } + struct NestedInit { int a, b, c; }; NestedInit ni[1] = {{NestedInit{1, 2, 3}}}; diff --git a/clang/test/CXX/drs/dr21xx.cpp b/clang/test/CXX/drs/dr21xx.cpp index 87040246aa5cd4..a7e50df3f374be 100644 --- a/clang/test/CXX/drs/dr21xx.cpp +++ b/clang/test/CXX/drs/dr21xx.cpp @@ -11,16 +11,6 @@ // cxx98-error@-1 {{variadic macros are a C99 feature}} #endif -namespace std { - __extension__ typedef __SIZE_TYPE__ size_t; - - template struct initializer_list { - const E *p; size_t n; - initializer_list(const E *p, size_t n); - initializer_list(); - }; -} - namespace dr2100 { // dr2100: 12 template struct X {}; template struct A { @@ -142,41 +132,6 @@ namespace dr2126 { // dr2126: 12 #endif } -namespace dr2137 { // dr2137: 18 -#if __cplusplus >= 201103L - 
struct Q { - Q(); - Q(Q&&); - Q(std::initializer_list) = delete; // #dr2137-Qcons - }; - - Q x = Q { Q() }; - // since-cxx11-error@-1 {{call to deleted constructor of 'Q'}} - // since-cxx11-note@#dr2137-Qcons {{'Q' has been explicitly marked deleted here}} - - int f(Q); // #dr2137-f - int y = f({ Q() }); - // since-cxx11-error@-1 {{call to deleted constructor of 'Q'}} - // since-cxx11-note@#dr2137-Qcons {{'Q' has been explicitly marked deleted here}} - // since-cxx11-note@#dr2137-f {{passing argument to parameter here}} - - struct U { - U(); - U(const U&); - }; - - struct Derived : U { - Derived(); - Derived(const Derived&); - } d; - - int g(Derived); - int g(U(&&)[1]) = delete; - - int z = g({ d }); -#endif -} - namespace dr2140 { // dr2140: 9 #if __cplusplus >= 201103L union U { int a; decltype(nullptr) b; }; diff --git a/clang/test/CXX/drs/dr23xx.cpp b/clang/test/CXX/drs/dr23xx.cpp index d8556998315c77..03077ae9239a45 100644 --- a/clang/test/CXX/drs/dr23xx.cpp +++ b/clang/test/CXX/drs/dr23xx.cpp @@ -6,16 +6,6 @@ // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s // RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s -namespace std { - __extension__ typedef __SIZE_TYPE__ size_t; - - template struct initializer_list { - const E *p; size_t n; - initializer_list(const E *p, size_t n); - initializer_list(); - }; -} - #if __cplusplus >= 201103L namespace dr2303 { // dr2303: 12 template @@ -57,81 +47,6 @@ void g() { } //namespace dr2303 #endif -namespace dr2311 { // dr2311: 18 open -#if __cplusplus >= 201707L -template -void test() { - // Ensure none of these try to call a move constructor. 
- T a = T{T(0)}; - T b{T(0)}; - auto c{T(0)}; - T d = {T(0)}; - auto e = {T(0)}; -#if __cplusplus >= 202302L - auto f = auto{T(0)}; -#endif - void(*fn)(T); - fn({T(0)}); -} - -struct NonMovable { - NonMovable(int); - NonMovable(NonMovable&&) = delete; -}; -struct NonMovableNonApplicableIList { - NonMovableNonApplicableIList(int); - NonMovableNonApplicableIList(NonMovableNonApplicableIList&&) = delete; - NonMovableNonApplicableIList(std::initializer_list); -}; -struct ExplicitMovable { - ExplicitMovable(int); - explicit ExplicitMovable(ExplicitMovable&&); -}; -struct ExplicitNonMovable { - ExplicitNonMovable(int); - explicit ExplicitNonMovable(ExplicitNonMovable&&) = delete; -}; -struct ExplicitNonMovableNonApplicableIList { - ExplicitNonMovableNonApplicableIList(int); - explicit ExplicitNonMovableNonApplicableIList(ExplicitNonMovableNonApplicableIList&&) = delete; - ExplicitNonMovableNonApplicableIList(std::initializer_list); -}; -struct CopyOnly { - CopyOnly(int); - CopyOnly(const CopyOnly&); - CopyOnly(CopyOnly&&) = delete; -}; -struct ExplicitCopyOnly { - ExplicitCopyOnly(int); - explicit ExplicitCopyOnly(const ExplicitCopyOnly&); - explicit ExplicitCopyOnly(ExplicitCopyOnly&&) = delete; -}; - -template void test(); -template void test(); -template void test(); -template void test(); -template void test(); -template void test(); -template void test(); - -struct any { - template - any(T&&); -}; - -template -struct X { - X(); - X(T) = delete; // #dr2311-X -}; - -X> x{ X>() }; -// since-cxx17-error@-1 {{call to deleted constructor of 'X>'}} -// since-cxx17-note@#dr2311-X {{'X' has been explicitly marked deleted here}} -#endif -} - // dr2331: na // dr2335 is in dr2335.cxx diff --git a/clang/test/CodeGen/LoongArch/intrinsic-la32.c b/clang/test/CodeGen/LoongArch/intrinsic-la32.c index 93d54f511a9cd2..eb3f8cbe7ac4cc 100644 --- a/clang/test/CodeGen/LoongArch/intrinsic-la32.c +++ b/clang/test/CodeGen/LoongArch/intrinsic-la32.c @@ -169,8 +169,8 @@ unsigned int 
cpucfg(unsigned int a) { // LA32-LABEL: @rdtime( // LA32-NEXT: entry: -// LA32-NEXT: [[TMP0:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimeh.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1:[0-9]+]], !srcloc !2 -// LA32-NEXT: [[TMP1:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimel.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc !3 +// LA32-NEXT: [[TMP0:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimeh.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1:[0-9]+]], !srcloc [[META2:![0-9]+]] +// LA32-NEXT: [[TMP1:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimel.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc [[META3:![0-9]+]] // LA32-NEXT: ret void // void rdtime() { @@ -201,13 +201,28 @@ void loongarch_movgr2fcsr(int a) { __builtin_loongarch_movgr2fcsr(1, a); } -// CHECK-LABEL: @cacop_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.cacop.w(i32 1, i32 [[A:%.*]], i32 1024) -// CHECK-NEXT: tail call void @llvm.loongarch.cacop.w(i32 1, i32 [[A]], i32 1024) -// CHECK-NEXT: ret void +// LA32-LABEL: @cacop_w( +// LA32-NEXT: entry: +// LA32-NEXT: tail call void @llvm.loongarch.cacop.w(i32 1, i32 [[A:%.*]], i32 1024) +// LA32-NEXT: tail call void @llvm.loongarch.cacop.w(i32 1, i32 [[A]], i32 1024) +// LA32-NEXT: ret void // void cacop_w(unsigned long int a) { __cacop_w(1, a, 1024); __builtin_loongarch_cacop_w(1, a, 1024); } + +// LA32-LABEL: @iocsrrd_h_result( +// LA32-NEXT: entry: +// LA32-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A:%.*]]) +// LA32-NEXT: [[CONV_I:%.*]] = trunc i32 [[TMP0]] to i16 +// LA32-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A]]) +// LA32-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// LA32-NEXT: [[CONV3:%.*]] = add i16 [[TMP2]], [[CONV_I]] +// LA32-NEXT: ret i16 [[CONV3]] +// +unsigned short iocsrrd_h_result(unsigned int a) { + unsigned short b = __iocsrrd_h(a); + unsigned short c = __builtin_loongarch_iocsrrd_h(a); + return b+c; +} diff --git 
a/clang/test/CodeGen/LoongArch/intrinsic-la64.c b/clang/test/CodeGen/LoongArch/intrinsic-la64.c index a740882eef5411..50ec358f546ec0 100644 --- a/clang/test/CodeGen/LoongArch/intrinsic-la64.c +++ b/clang/test/CodeGen/LoongArch/intrinsic-la64.c @@ -387,7 +387,7 @@ unsigned int cpucfg(unsigned int a) { // CHECK-LABEL: @rdtime_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call { i64, i64 } asm sideeffect "rdtime.d $0, $1\0A\09", "=&r,=&r"() #[[ATTR1:[0-9]+]], !srcloc !2 +// CHECK-NEXT: [[TMP0:%.*]] = tail call { i64, i64 } asm sideeffect "rdtime.d $0, $1\0A\09", "=&r,=&r"() #[[ATTR1:[0-9]+]], !srcloc [[META2:![0-9]+]] // CHECK-NEXT: ret void // void rdtime_d() { @@ -396,8 +396,8 @@ void rdtime_d() { // CHECK-LABEL: @rdtime( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimeh.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc !3 -// CHECK-NEXT: [[TMP1:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimel.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc !4 +// CHECK-NEXT: [[TMP0:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimeh.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc [[META3:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimel.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc [[META4:![0-9]+]] // CHECK-NEXT: ret void // void rdtime() { @@ -427,3 +427,18 @@ void loongarch_movgr2fcsr(int a) { __movgr2fcsr(1, a); __builtin_loongarch_movgr2fcsr(1, a); } + +// CHECK-LABEL: @iocsrrd_h_result( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A:%.*]]) +// CHECK-NEXT: [[CONV_I:%.*]] = trunc i32 [[TMP0]] to i16 +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A]]) +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: [[CONV3:%.*]] = add i16 [[TMP2]], [[CONV_I]] +// CHECK-NEXT: ret i16 [[CONV3]] +// +unsigned short iocsrrd_h_result(unsigned int a) { + unsigned short b = 
__iocsrrd_h(a); + unsigned short c = __builtin_loongarch_iocsrrd_h(a); + return b+c; +} diff --git a/clang/test/CodeGen/Mips/inline-asm-constraints.c b/clang/test/CodeGen/Mips/inline-asm-constraints.c new file mode 100644 index 00000000000000..88afe8735083b4 --- /dev/null +++ b/clang/test/CodeGen/Mips/inline-asm-constraints.c @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 -emit-llvm -triple mips -target-feature +soft-float %s -o - | FileCheck %s --check-prefix=SOFT_FLOAT + +// SOFT_FLOAT: call void asm sideeffect "", "r,~{$1}"(float %1) +void read_float(float *p) { + __asm__("" ::"r"(*p)); +} + +// SOFT_FLOAT: call void asm sideeffect "", "r,~{$1}"(double %1) +void read_double(double *p) { + __asm__("" :: "r"(*p)); +} diff --git a/clang/test/CodeGen/RISCV/riscv-func-attr-target.c b/clang/test/CodeGen/RISCV/riscv-func-attr-target.c index 7d3362e84e7588..f216eaf735b4a8 100644 --- a/clang/test/CodeGen/RISCV/riscv-func-attr-target.c +++ b/clang/test/CodeGen/RISCV/riscv-func-attr-target.c @@ -39,7 +39,7 @@ __attribute__((target("cpu=sifive-u54"))) void testAttrCpuOnly() {} // CHECK: attributes #0 = { {{.*}}"target-features"="+64bit,+a,+m,+save-restore,+zifencei,-relax,-zbb,-zfa" } // CHECK: attributes #1 = { {{.*}}"target-cpu"="rocket-rv64" "target-features"="+64bit,+a,+d,+f,+m,+save-restore,+v,+zicsr,+zifencei,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-relax,-zbb,-zfa" "tune-cpu"="generic-rv64" } // CHECK: attributes #2 = { {{.*}}"target-features"="+64bit,+a,+m,+save-restore,+zbb,+zifencei,-relax,-zfa" } -// CHECK: attributes #3 = { {{.*}}"target-features"="+64bit,+a,+d,+experimental-zicond,+f,+m,+save-restore,+v,+zbb,+zicsr,+zifencei,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-relax,-zfa" } +// CHECK: attributes #3 = { {{.*}}"target-features"="+64bit,+a,+d,+f,+m,+save-restore,+v,+zbb,+zicond,+zicsr,+zifencei,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-relax,-zfa" } // Make sure we append negative features if we 
override the arch // CHECK: attributes #4 = { {{.*}}"target-features"="+64bit,+a,+c,+d,+f,+m,+save-restore,+zbb,+zicsr,+zifencei,{{(-[[:alnum:]-]+)(,-[[:alnum:]-]+)*}}" } // CHECK: attributes #5 = { {{.*}}"target-features"="+64bit,+m,+save-restore,{{(-[[:alnum:]-]+)(,-[[:alnum:]-]+)*}}" } diff --git a/clang/test/CodeGen/RISCV/tls-dialect.c b/clang/test/CodeGen/RISCV/tls-dialect.c new file mode 100644 index 00000000000000..e624a8b3fe4e67 --- /dev/null +++ b/clang/test/CodeGen/RISCV/tls-dialect.c @@ -0,0 +1,14 @@ +// REQUIRES: riscv-registered-target +/// cc1 -enable-tlsdesc (due to -mtls-dialect=desc) enables TLSDESC. +// RUN: %clang_cc1 -triple riscv64 -S -mrelocation-model pic -pic-level 1 -enable-tlsdesc %s -o - | FileCheck %s --check-prefix=DESC +// RUN: %clang_cc1 -triple riscv64 -S -mrelocation-model pic -pic-level 1 %s -o - | FileCheck %s --check-prefix=NODESC + +__thread int x; + +// DESC: %tlsdesc_hi +// DESC-NOT: %tls_gd_pcrel_hi +// NODESC: %tls_gd_pcrel_hi +// NODESC-NOT: %tlsdesc_hi +int use() { + return x; +} diff --git a/clang/test/CodeGen/aapcs-align.cpp b/clang/test/CodeGen/aapcs-align.cpp index 2886a32974b066..4f393d9e6b7f32 100644 --- a/clang/test/CodeGen/aapcs-align.cpp +++ b/clang/test/CodeGen/aapcs-align.cpp @@ -134,8 +134,8 @@ void g6() { f6m(1, 2, 3, 4, 5, s); } // CHECK: define{{.*}} void @g6 -// CHECK: call void @f6(i32 noundef 1, [4 x i32] [i32 6, i32 7, i32 0, i32 0]) -// CHECK: call void @f6m(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, [4 x i32] [i32 6, i32 7, i32 0, i32 0]) +// CHECK: call void @f6(i32 noundef 1, [4 x i32] [i32 6, i32 7, i32 0, i32 undef]) +// CHECK: call void @f6m(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, [4 x i32] [i32 6, i32 7, i32 0, i32 undef]) // CHECK: declare void @f6(i32 noundef, [4 x i32]) // CHECK: declare void @f6m(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, [4 x i32]) } diff --git a/clang/test/CodeGen/aapcs64-align.cpp 
b/clang/test/CodeGen/aapcs64-align.cpp index 759413cbc4b56f..de231f2123b975 100644 --- a/clang/test/CodeGen/aapcs64-align.cpp +++ b/clang/test/CodeGen/aapcs64-align.cpp @@ -75,8 +75,8 @@ void g4() { f4m(1, 2, 3, 4, 5, s); } // CHECK: define{{.*}} void @g4() -// CHECK: call void @f4(i32 noundef 1, [2 x i64] %{{.*}}) -// CHECK: void @f4m(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, [2 x i64] %{{.*}}) +// CHECK: call void @f4(i32 noundef 1, [2 x i64] [i64 30064771078, i64 0]) +// CHECK: void @f4m(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, [2 x i64] [i64 30064771078, i64 0]) // CHECK: declare void @f4(i32 noundef, [2 x i64]) // CHECK: declare void @f4m(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, [2 x i64]) @@ -95,8 +95,8 @@ void f5m(int, int, int, int, int, P16); f5m(1, 2, 3, 4, 5, s); } // CHECK: define{{.*}} void @g5() -// CHECK: call void @f5(i32 noundef 1, [2 x i64] %{{.*}}) -// CHECK: void @f5m(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, [2 x i64] %{{.*}}) +// CHECK: call void @f5(i32 noundef 1, [2 x i64] [i64 30064771078, i64 0]) +// CHECK: void @f5m(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, [2 x i64] [i64 30064771078, i64 0]) // CHECK: declare void @f5(i32 noundef, [2 x i64]) // CHECK: declare void @f5m(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, [2 x i64]) diff --git a/clang/test/CodeGen/aarch64-inline-asm.c b/clang/test/CodeGen/aarch64-inline-asm.c index 75e9a8c46b8769..8ddee560b11da4 100644 --- a/clang/test/CodeGen/aarch64-inline-asm.c +++ b/clang/test/CodeGen/aarch64-inline-asm.c @@ -95,3 +95,11 @@ void test_reduced_gpr_constraints(int var32, long var64) { // CHECK: [[ARG2:%.+]] = load i64, ptr // CHECK: call void asm sideeffect "add x0, x0, $0", "@3Ucj,~{x0}"(i64 [[ARG2]]) } + +void test_sme_constraints(){ + asm("movt zt0[3, mul vl], z0" : : : "za"); +// CHECK: call void asm sideeffect "movt 
zt0[3, mul vl], z0", "~{za}"() + + asm("movt zt0[3, mul vl], z0" : : : "zt0"); +// CHECK: call void asm sideeffect "movt zt0[3, mul vl], z0", "~{zt0}"() +} \ No newline at end of file diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c index 695e0afa3d0de7..a333d85818d2fa 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s 
-check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c index a3c3d8bf13db6f..7617dcef7ea973 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple 
aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c index 2c2f100ac7f8cc..5fa4c35ed770ff 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 
-triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c index 0502073097d541..b26e32e5ff8332 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c index 60feebced32d2e..02d4d034befb78 100644 --- 
a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c index b0c1dd904284f5..c2c89aee03b5e6 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// 
RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c index 3dcc2c70d3cfcd..e036cb45feff36 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 
-fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c index 06a6a19ca3f7a6..84338597cdb30b 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature 
+sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple 
aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c index 69641df8a80fb1..7b1a8b0a0201ff 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple 
aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c index d726855c9743fb..3d2a4e4d2b38e0 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s 
| FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c index ed1e70a3a469f5..9e44d1c9253466 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c +++ 
b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// 
RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c index f93f3e5138b24d..d8e4b853308d0f 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c index b4bc041a21d588..467cf9fd092a04 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple 
aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c index 2ad7e364257235..e58021bf8bf456 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple 
aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c index a00a48ceafccb7..483e813275028f 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 
-Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c index 7f56941108828a..e62f7b92821e42 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git 
a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c index cfc2ee0f77be69..2cc99d4fb88d5f 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c @@ -2,11 +2,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 
-target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c index 720254c8421203..1ff7a7fedf1bf3 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c @@ -2,11 +2,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall 
-emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c index a5dec263bff2c2..257cb595250181 100644 --- 
a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c @@ -1,15 +1,15 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve \ -// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve \ -// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve \ -// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve \ -// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve \ -// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck 
%s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c index a8d881b1e2ed3d..79a11c2ec153e4 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c @@ -2,11 +2,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// 
RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c index f03e998131df51..2b2b2e5c0f411d 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c @@ -2,11 +2,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 
-D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c index b90358112e47f1..8d1e358176c30f 100644 --- 
a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c @@ -2,11 +2,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 
-S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c index 7c210fbe6923e9..70c31a4a87e7e9 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c @@ -2,9 +2,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c index 
d7ef75ce01dd70..5bc9c9088517e8 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c @@ -2,9 +2,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c index f65c0ef61f8187..82c004e3105a43 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c @@ -2,9 +2,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim 
| FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c index cbab1cc8e81bd4..e8706f9576915f 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c @@ -2,9 +2,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu 
-target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c index f7f16281ff4061..99feafbd682acb 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c @@ -2,9 +2,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S 
-disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c index 3fedfdc3308932..0f0c33e48bd97b 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c @@ -2,9 +2,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c index 4b9b4363ec6296..a4e2616784efa7 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c @@ -1,9 +1,9 @@ // NOTE: Assertions have been autogenerated by 
utils/update_cc_test_checks.py -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 
-fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c index 5e499573304c81..3e554212cb70be 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s 
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c index 26c175c76532b7..a438fd395219bb 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c @@ -1,9 +1,9 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS 
-triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c index 8c5e7eb4019919..b0cbdc748dc80c 
100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s 
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c index b552bbb66a259e..5cc0e0e1d36e1a 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c @@ -2,11 +2,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt 
-S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c index cd87ee00999500..5fd4b04056526e 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c @@ -1,9 +1,9 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c index 6847eb99af2df3..b86cb19c01e308 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c @@ -2,10 +2,10 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - 
%s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c index c7da703ddb2718..7af8c589994fbb 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c @@ -2,11 +2,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone 
-Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c index 6f20b37f87897d..5937a288dd8468 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c @@ -2,12 +2,12 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | 
opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c index 781b699c882bb7..f54c09d5ef2c37 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c @@ -2,12 +2,12 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve 
-target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c index 2b24f65efc2663..2fb5d3bea27c23 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c @@ -2,11 +2,11 
@@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone 
-Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c index 32f9bc3ab88086..eee927acc22e34 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve 
-target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c index 8735016ae728cf..6308d6c596f164 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c @@ -1,11 +1,11 @@ // NOTE: 
Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c index 32eb9a4b40c405..c2ecbf93bfaa78 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S 
-passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c index 6e9d5cf4492f54..784c24c8e7cb68 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c index f4dadef8fa5789..6349cec771199e 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c @@ -1,11 +1,11 @@ // NOTE: Assertions have 
been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c index ff7f0c501f5507..3d56948e25f73b 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple 
aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c index 3f13b3a0db73d9..4cc1f3af32ec05 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple 
aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c index b8408ea31ed0b4..cc356600ab5333 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git 
a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c index 33733356f3078f..069bf13ff8d281 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -target-feature +sve -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple 
aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-targetattr.c b/clang/test/CodeGen/aarch64-targetattr.c index 02da18264da0a3..1a3a84a73dbad1 100644 --- a/clang/test/CodeGen/aarch64-targetattr.c +++ b/clang/test/CodeGen/aarch64-targetattr.c @@ -97,19 +97,19 @@ void minusarch() {} // CHECK: attributes #0 = { {{.*}} "target-features"="+crc,+fp-armv8,+lse,+neon,+ras,+rdm,+v8.1a,+v8.2a,+v8a" } // CHECK: attributes #1 = { {{.*}} "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+v8.1a,+v8.2a,+v8a" } // CHECK: attributes #2 = { {{.*}} "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8a" } -// CHECK: attributes #3 = { {{.*}} "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+ras,+rcpc,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" } +// CHECK: attributes #3 = { {{.*}} "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" } // CHECK: attributes #4 = { {{.*}} "target-cpu"="cortex-a710" "target-features"="+bf16,+complxnum,+crc,+dotprod,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+pauth,+ras,+rcpc,+rdm,+sb,+sve,+sve2,+sve2-bitperm" } // CHECK: attributes #5 = { {{.*}} "tune-cpu"="cortex-a710" } // CHECK: attributes #6 = { {{.*}} "target-cpu"="generic" } // CHECK: attributes #7 = { {{.*}} 
"tune-cpu"="generic" } // CHECK: attributes #8 = { {{.*}} "target-cpu"="neoverse-n1" "target-features"="+aes,+crc,+dotprod,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs" "tune-cpu"="cortex-a710" } // CHECK: attributes #9 = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+sve" "tune-cpu"="cortex-a710" } -// CHECK: attributes #10 = { {{.*}} "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,+sve,+sve2" } -// CHECK: attributes #11 = { {{.*}} "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,-sve" } +// CHECK: attributes #10 = { {{.*}} "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,+sve,+sve2" } +// CHECK: attributes #11 = { {{.*}} "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,-sve" } // CHECK: attributes #12 = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+sve" } // CHECK: attributes #13 = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+sve,-sve2" } // CHECK: attributes #14 = { {{.*}} "target-features"="+fullfp16" } -// CHECK: attributes #15 = { {{.*}} "target-cpu"="neoverse-n1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" } -// CHECK: attributes #16 = { {{.*}} "branch-target-enforcement"="true" "guarded-control-stack"="true" {{.*}} 
"target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" } +// CHECK: attributes #15 = { {{.*}} "target-cpu"="neoverse-n1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" } +// CHECK: attributes #16 = { {{.*}} "branch-target-enforcement"="true" "guarded-control-stack"="true" {{.*}} "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" } // CHECK: attributes #17 = { {{.*}} "target-features"="-neon" } // CHECK: attributes #18 = { {{.*}} "target-features"="-v9.3a" } diff --git a/clang/test/CodeGen/arm64-microsoft-arguments.cpp b/clang/test/CodeGen/arm64-microsoft-arguments.cpp index e8309888dcfe21..85472645acb3b3 100644 --- a/clang/test/CodeGen/arm64-microsoft-arguments.cpp +++ b/clang/test/CodeGen/arm64-microsoft-arguments.cpp @@ -201,3 +201,18 @@ S11 f11() { S11 x; return func11(x); } + +// GH86384 +// Pass and return object with template constructor (pass directly, +// return indirectly). 
+// CHECK: define dso_local void @"?f12@@YA?AUS12@@XZ"(ptr dead_on_unwind inreg noalias writable sret(%struct.S12) align 4 {{.*}}) +// CHECK: call void @"?func12@@YA?AUS12@@U1@@Z"(ptr dead_on_unwind inreg writable sret(%struct.S12) align 4 {{.*}}, i64 {{.*}}) +struct S12 { + template S12(T*) {} + int x; +}; +S12 func12(S12 x); +S12 f12() { + S12 x((int*)0); + return func12(x); +} diff --git a/clang/test/CodeGen/attr-counted-by-pr88931.c b/clang/test/CodeGen/attr-counted-by-pr88931.c new file mode 100644 index 00000000000000..520ebd09973284 --- /dev/null +++ b/clang/test/CodeGen/attr-counted-by-pr88931.c @@ -0,0 +1,40 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -Wno-missing-declarations -emit-llvm -o - %s | FileCheck %s + +struct foo { + int x,y,z; + struct bar { + int count; + int array[] __attribute__((counted_by(count))); + }; +}; + +void init(void * __attribute__((pass_dynamic_object_size(0)))); + +// CHECK-LABEL: define dso_local void @test1( +// CHECK-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[ARRAY:%.*]] = getelementptr inbounds [[STRUCT_BAR:%.*]], ptr [[P]], i64 0, i32 1 +// CHECK-NEXT: tail call void @init(ptr noundef nonnull [[ARRAY]], i64 noundef -1) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: ret void +// +void test1(struct bar *p) { + init(p->array); +} + +struct mux { + int count; + int array[] __attribute__((counted_by(count))); +}; + +struct bux { struct mux x; }; + +// CHECK-LABEL: define dso_local void @test2( +// CHECK-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @init(ptr noundef [[P]], i64 noundef -1) #[[ATTR2]] +// CHECK-NEXT: ret void +// +void test2(struct bux *p) { + init(p); +} diff --git a/clang/test/CodeGen/attr-counted-by-pr88931.cpp b/clang/test/CodeGen/attr-counted-by-pr88931.cpp new file mode 
100644 index 00000000000000..2a8cc1d07e50d9 --- /dev/null +++ b/clang/test/CodeGen/attr-counted-by-pr88931.cpp @@ -0,0 +1,21 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -Wall -emit-llvm -o - %s | FileCheck %s + +struct foo { + struct bar { + int array[]; + bar(); + }; +}; + +void init(void * __attribute__((pass_dynamic_object_size(0)))); + +// CHECK-LABEL: define dso_local void @_ZN3foo3barC1Ev( +// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(1) [[THIS:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @_Z4initPvU25pass_dynamic_object_size0(ptr noundef nonnull [[THIS]], i64 noundef -1) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: ret void +// +foo::bar::bar() { + init(array); +} diff --git a/clang/test/CodeGen/attr-counted-by.c b/clang/test/CodeGen/attr-counted-by.c index 74d5457e398b91..118e6b889e2672 100644 --- a/clang/test/CodeGen/attr-counted-by.c +++ b/clang/test/CodeGen/attr-counted-by.c @@ -1288,16 +1288,10 @@ int test14(int idx) { // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test15( // NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR4]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[FOO:%.*]] = alloca [[STRUCT_ANON_8:%.*]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[FOO]]) #[[ATTR12]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 1, ptr [[FOO]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 4 -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 2, ptr [[TMP0]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANON_8]], ptr [[FOO]], i64 0, i32 2, i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr 
[[ARRAYIDX]], align 4, !tbaa [[TBAA2]] -// NO-SANITIZE-WITH-ATTR-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[FOO]]) #[[ATTR12]] -// NO-SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP1]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr getelementptr inbounds ([[STRUCT_ANON_8:%.*]], ptr @__const.test15.foo, i64 1, i32 0), i64 0, i64 [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP0]] // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test15( // SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { @@ -1315,16 +1309,10 @@ int test14(int idx) { // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test15( // NO-SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR1]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[FOO:%.*]] = alloca [[STRUCT_ANON_8:%.*]], align 4 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[FOO]]) #[[ATTR9]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 1, ptr [[FOO]], align 4 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 4 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 2, ptr [[TMP0]], align 4 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANON_8]], ptr [[FOO]], i64 0, i32 2, i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[FOO]]) #[[ATTR9]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP1]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr getelementptr inbounds ([[STRUCT_ANON_8:%.*]], ptr 
@__const.test15.foo, i64 1, i32 0), i64 0, i64 [[IDXPROM]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP0]] // int test15(int idx) { struct { diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c index 886af083f1c009..b591249bbef1bc 100644 --- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c +++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c @@ -18,8 +18,29 @@ typedef __rvv_uint64m1_t vuint64m1_t; typedef __rvv_float32m1_t vfloat32m1_t; typedef __rvv_float64m1_t vfloat64m1_t; +typedef __rvv_bool1_t vbool1_t; +typedef __rvv_bool2_t vbool2_t; +typedef __rvv_bool4_t vbool4_t; +typedef __rvv_bool8_t vbool8_t; +typedef __rvv_bool16_t vbool16_t; +typedef __rvv_bool32_t vbool32_t; +typedef __rvv_bool64_t vbool64_t; + typedef vint64m1_t fixed_int64m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); typedef vfloat64m1_t fixed_float64m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vbool1_t fixed_bool1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vbool2_t fixed_bool2_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 2))); +typedef vbool4_t fixed_bool4_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 4))); +typedef vbool8_t fixed_bool8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 8))); +#if __riscv_v_fixed_vlen >= 128 +typedef vbool16_t fixed_bool16_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 16))); +#endif +#if __riscv_v_fixed_vlen >= 256 +typedef vbool32_t fixed_bool32_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 32))); +#endif +#if __riscv_v_fixed_vlen >= 512 +typedef vbool64_t fixed_bool64_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 64))); +#endif #define DEFINE_STRUCT(ty) \ struct struct_##ty { \ @@ -28,6 +49,19 
@@ typedef vfloat64m1_t fixed_float64m1_t __attribute__((riscv_rvv_vector_bits(__ri DEFINE_STRUCT(int64m1) DEFINE_STRUCT(float64m1) +DEFINE_STRUCT(bool1) +DEFINE_STRUCT(bool2) +DEFINE_STRUCT(bool4) +DEFINE_STRUCT(bool8) +#if __riscv_v_fixed_vlen >= 128 +DEFINE_STRUCT(bool16) +#endif +#if __riscv_v_fixed_vlen >= 256 +DEFINE_STRUCT(bool32) +#endif +#if __riscv_v_fixed_vlen >= 512 +DEFINE_STRUCT(bool64) +#endif //===----------------------------------------------------------------------===// // int64 @@ -136,3 +170,69 @@ vfloat64m1_t read_float64m1(struct struct_float64m1 *s) { void write_float64m1(struct struct_float64m1 *s, vfloat64m1_t x) { s->y[0] = x; } + +//===----------------------------------------------------------------------===// +// bool +//===----------------------------------------------------------------------===// + +// CHECK-64-LABEL: @read_bool1( +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[SAVED_VALUE:%.*]] = alloca <8 x i8>, align 8 +// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-64-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[Y]], align 8, !tbaa [[TBAA4]] +// CHECK-64-NEXT: store <8 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]] +// CHECK-64-NEXT: [[TMP1:%.*]] = load , ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]] +// CHECK-64-NEXT: ret [[TMP1]] +// +// CHECK-128-LABEL: @read_bool1( +// CHECK-128-NEXT: entry: +// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca <16 x i8>, align 16 +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-128-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[Y]], align 8, !tbaa [[TBAA4]] +// CHECK-128-NEXT: store <16 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 16, !tbaa [[TBAA4]] +// CHECK-128-NEXT: [[TMP1:%.*]] = load , ptr [[SAVED_VALUE]], align 16, !tbaa [[TBAA4]] +// CHECK-128-NEXT: ret [[TMP1]] +// +// CHECK-256-LABEL: @read_bool1( +// CHECK-256-NEXT: entry: +// 
CHECK-256-NEXT: [[SAVED_VALUE:%.*]] = alloca <32 x i8>, align 32 +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-256-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr [[Y]], align 8, !tbaa [[TBAA4]] +// CHECK-256-NEXT: store <32 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 32, !tbaa [[TBAA4]] +// CHECK-256-NEXT: [[TMP1:%.*]] = load , ptr [[SAVED_VALUE]], align 32, !tbaa [[TBAA4]] +// CHECK-256-NEXT: ret [[TMP1]] +// +vbool1_t read_bool1(struct struct_bool1 *s) { + return s->y[0]; +} + +// CHECK-64-LABEL: @write_bool1( +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 8 +// CHECK-64-NEXT: store [[X:%.*]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA7:![0-9]+]] +// CHECK-64-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]] +// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-64-NEXT: store <8 x i8> [[TMP0]], ptr [[Y]], align 8, !tbaa [[TBAA4]] +// CHECK-64-NEXT: ret void +// +// CHECK-128-LABEL: @write_bool1( +// CHECK-128-NEXT: entry: +// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 16 +// CHECK-128-NEXT: store [[X:%.*]], ptr [[SAVED_VALUE]], align 16, !tbaa [[TBAA7:![0-9]+]] +// CHECK-128-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[SAVED_VALUE]], align 16, !tbaa [[TBAA4]] +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-128-NEXT: store <16 x i8> [[TMP0]], ptr [[Y]], align 8, !tbaa [[TBAA4]] +// CHECK-128-NEXT: ret void +// +// CHECK-256-LABEL: @write_bool1( +// CHECK-256-NEXT: entry: +// CHECK-256-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 8 +// CHECK-256-NEXT: store [[X:%.*]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA7:![0-9]+]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]] +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds 
[[STRUCT_STRUCT_BOOL1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-256-NEXT: store <32 x i8> [[TMP0]], ptr [[Y]], align 8, !tbaa [[TBAA4]] +// CHECK-256-NEXT: ret void +// +void write_bool1(struct struct_bool1 *s, vbool1_t x) { + s->y[0] = x; +} diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c index 70e1aefe7aaffb..888abe1a7bc3fb 100644 --- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c +++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c @@ -7,6 +7,8 @@ typedef vint32m1_t fixed_int32m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); typedef vfloat64m1_t fixed_float64m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vbool1_t fixed_bool1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vbool4_t fixed_bool4_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/4))); //===----------------------------------------------------------------------===// // Test caller/callee with VLST <-> VLAT @@ -66,6 +68,30 @@ fixed_float64m1_t call_float64_ff(fixed_float64m1_t op1, fixed_float64m1_t op2) return __riscv_vfadd(op1, op2, __riscv_v_fixed_vlen/64); } +// CHECK-LABEL: @call_bool1_ff( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SAVED_VALUE4:%.*]] = alloca , align 8 +// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 8 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv64i1.i64( [[OP1_COERCE:%.*]], [[OP2_COERCE:%.*]], i64 256) +// CHECK-NEXT: store [[TMP0]], ptr [[SAVED_VALUE4]], align 8, !tbaa [[TBAA4:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE4]], align 8, !tbaa [[TBAA8:![0-9]+]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 8 +// CHECK-NEXT: ret [[TMP2]] +// +fixed_bool1_t call_bool1_ff(fixed_bool1_t op1, fixed_bool1_t op2) { + return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen); +} + 
+// CHECK-LABEL: @call_bool4_ff( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.riscv.vmand.nxv16i1.i64( [[TMP0:%.*]], [[TMP1:%.*]], i64 64) +// CHECK-NEXT: ret [[TMP2]] +// +fixed_bool4_t call_bool4_ff(fixed_bool4_t op1, fixed_bool4_t op2) { + return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen / 4); +} + //===----------------------------------------------------------------------===// // fixed, scalable //===----------------------------------------------------------------------===// @@ -88,6 +114,30 @@ fixed_float64m1_t call_float64_fs(fixed_float64m1_t op1, vfloat64m1_t op2) { return __riscv_vfadd(op1, op2, __riscv_v_fixed_vlen/64); } +// CHECK-LABEL: @call_bool1_fs( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SAVED_VALUE2:%.*]] = alloca , align 8 +// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 8 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv64i1.i64( [[OP1_COERCE:%.*]], [[OP2:%.*]], i64 256) +// CHECK-NEXT: store [[TMP0]], ptr [[SAVED_VALUE2]], align 8, !tbaa [[TBAA4]] +// CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE2]], align 8, !tbaa [[TBAA8]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 8 +// CHECK-NEXT: ret [[TMP2]] +// +fixed_bool1_t call_bool1_fs(fixed_bool1_t op1, vbool1_t op2) { + return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen); +} + +// CHECK-LABEL: @call_bool4_fs( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.riscv.vmand.nxv16i1.i64( [[TMP0:%.*]], [[OP2:%.*]], i64 64) +// CHECK-NEXT: ret [[TMP1]] +// +fixed_bool4_t call_bool4_fs(fixed_bool4_t op1, vbool4_t op2) { + return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen / 4); +} + //===----------------------------------------------------------------------===// // scalable, scalable //===----------------------------------------------------------------------===// @@ -109,3 +159,27 @@ fixed_int32m1_t call_int32_ss(vint32m1_t op1, 
vint32m1_t op2) { fixed_float64m1_t call_float64_ss(vfloat64m1_t op1, vfloat64m1_t op2) { return __riscv_vfadd(op1, op2, __riscv_v_fixed_vlen/64); } + +// CHECK-LABEL: @call_bool1_ss( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 8 +// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 8 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv64i1.i64( [[OP1:%.*]], [[OP2:%.*]], i64 256) +// CHECK-NEXT: store [[TMP0]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]] +// CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA8]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 8 +// CHECK-NEXT: ret [[TMP2]] +// +fixed_bool1_t call_bool1_ss(vbool1_t op1, vbool1_t op2) { + return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen); +} + +// CHECK-LABEL: @call_bool4_ss( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv16i1.i64( [[OP1:%.*]], [[OP2:%.*]], i64 64) +// CHECK-NEXT: ret [[TMP0]] +// +fixed_bool4_t call_bool4_ss(vbool4_t op1, vbool4_t op2) { + return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen / 4); +} diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c index 93e9a4eee96eb8..fe278174bf6817 100644 --- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c +++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c @@ -16,6 +16,10 @@ typedef __rvv_uint64m1_t vuint64m1_t; typedef __rvv_float32m1_t vfloat32m1_t; typedef __rvv_float64m1_t vfloat64m1_t; +typedef __rvv_bool1_t vbool1_t; +typedef __rvv_bool4_t vbool4_t; +typedef __rvv_bool32_t vbool32_t; + typedef vint64m1_t fixed_int64m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); typedef vfloat64m1_t fixed_float64m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); @@ -23,6 +27,10 @@ typedef vint32m1_t fixed_int32m1_t 
__attribute__((riscv_rvv_vector_bits(__riscv_ typedef vfloat64m1_t fixed_float64m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); typedef int32_t gnu_int32m1_t __attribute__((vector_size(__riscv_v_fixed_vlen / 8))); +typedef vbool1_t fixed_bool1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vbool4_t fixed_bool4_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/4))); +typedef vbool32_t fixed_bool32_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/32))); + // CHECK-LABEL: @to_vint32m1_t( // CHECK-NEXT: entry: // CHECK-NEXT: ret [[TYPE_COERCE:%.*]] @@ -55,9 +63,69 @@ fixed_float64m1_t from_vfloat64m1_t(vfloat64m1_t type) { return type; } +// CHECK-LABEL: @from_vbool1_t( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 8 +// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 8 +// CHECK-NEXT: store [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA8:![0-9]+]] +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[RETVAL_COERCE]], align 8 +// CHECK-NEXT: ret [[TMP1]] +// +fixed_bool1_t from_vbool1_t(vbool1_t type) { + return type; +} + +// CHECK-LABEL: @to_vbool1_t( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret [[TYPE_COERCE:%.*]] +// +vbool1_t to_vbool1_t(fixed_bool1_t type) { + return type; +} + +// CHECK-LABEL: @from_vbool4_t( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret [[TYPE:%.*]] +// +fixed_bool4_t from_vbool4_t(vbool4_t type) { + return type; +} + +// CHECK-LABEL: @to_vbool4_t( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret [[TMP0:%.*]] +// +vbool4_t to_vbool4_t(fixed_bool4_t type) { + return type; +} + +// CHECK-LABEL: @from_vbool32_t( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 1 +// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 +// CHECK-NEXT: store [[TYPE:%.*]], ptr 
[[SAVED_VALUE]], align 1, !tbaa [[TBAA9:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA8]] +// CHECK-NEXT: store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 +// CHECK-NEXT: ret [[TMP1]] +// +fixed_bool32_t from_vbool32_t(vbool32_t type) { + return type; +} + +// CHECK-LABEL: @to_vbool32_t( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret [[TYPE_COERCE:%.*]] +// +vbool32_t to_vbool32_t(fixed_bool32_t type) { + return type; +} + // CHECK-LABEL: @to_vint32m1_t__from_gnu_int32m1_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]] +// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA8]] // CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i32.v8i32( undef, <8 x i32> [[TYPE]], i64 0) // CHECK-NEXT: ret [[CAST_SCALABLE]] // @@ -68,7 +136,7 @@ vint32m1_t to_vint32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) { // CHECK-LABEL: @from_vint32m1_t__to_gnu_int32m1_t( // CHECK-NEXT: entry: // CHECK-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32( [[TYPE:%.*]], i64 0) -// CHECK-NEXT: store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-NEXT: store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) { @@ -77,7 +145,7 @@ gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) { // CHECK-LABEL: @to_fixed_int32m1_t__from_gnu_int32m1_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA8]] // CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i32.v8i32( undef, <8 x i32> [[TYPE]], i64 
0) // CHECK-NEXT: ret [[CAST_SCALABLE]] // @@ -88,7 +156,7 @@ fixed_int32m1_t to_fixed_int32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) { // CHECK-LABEL: @from_fixed_int32m1_t__to_gnu_int32m1_t( // CHECK-NEXT: entry: // CHECK-NEXT: [[TYPE:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32( [[TYPE_COERCE:%.*]], i64 0) -// CHECK-NEXT: store <8 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-NEXT: store <8 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // gnu_int32m1_t from_fixed_int32m1_t__to_gnu_int32m1_t(fixed_int32m1_t type) { diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c index 959a6c9bf96888..ac22bdce0da3e5 100644 --- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c +++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c @@ -27,11 +27,117 @@ typedef __rvv_uint64m2_t vuint64m2_t; typedef __rvv_float32m2_t vfloat32m2_t; typedef __rvv_float64m2_t vfloat64m2_t; +typedef __rvv_bool1_t vbool1_t; +typedef __rvv_bool4_t vbool4_t; +typedef __rvv_bool32_t vbool32_t; + typedef vint32m1_t fixed_int32m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); typedef vint32m2_t fixed_int32m2_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * 2))); +typedef vint16m4_t fixed_int16m4_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * 4))); +typedef vint8m8_t fixed_int8m8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * 8))); +typedef vbool1_t fixed_bool1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vbool4_t fixed_bool4_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/4))); +typedef vbool32_t fixed_bool32_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/32))); fixed_int32m1_t global_vec; fixed_int32m2_t global_vec_m2; +fixed_int8m8_t global_vec_int8m8; +fixed_int16m4_t global_vec_int16m4; +fixed_bool1_t 
global_bool1; +fixed_bool4_t global_bool4; +fixed_bool32_t global_bool32; + +// CHECK-LABEL: @test_bool1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <256 x i8>, align 8 +// CHECK-NEXT: [[M_ADDR:%.*]] = alloca , align 1 +// CHECK-NEXT: [[VEC_ADDR:%.*]] = alloca , align 1 +// CHECK-NEXT: [[MASK:%.*]] = alloca , align 1 +// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca <32 x i8>, align 32 +// CHECK-NEXT: store [[M:%.*]], ptr [[M_ADDR]], align 1 +// CHECK-NEXT: store [[VEC:%.*]], ptr [[VEC_ADDR]], align 1 +// CHECK-NEXT: [[TMP0:%.*]] = load , ptr [[M_ADDR]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @global_bool1, align 8 +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[SAVED_VALUE]], align 32 +// CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[SAVED_VALUE]], align 32 +// CHECK-NEXT: [[TMP3:%.*]] = call @llvm.riscv.vmand.nxv64i1.i64( [[TMP0]], [[TMP2]], i64 256) +// CHECK-NEXT: store [[TMP3]], ptr [[MASK]], align 1 +// CHECK-NEXT: [[TMP4:%.*]] = load , ptr [[MASK]], align 1 +// CHECK-NEXT: [[TMP5:%.*]] = load , ptr [[VEC_ADDR]], align 1 +// CHECK-NEXT: [[TMP6:%.*]] = load <256 x i8>, ptr @global_vec_int8m8, align 8 +// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = call @llvm.vector.insert.nxv64i8.v256i8( undef, <256 x i8> [[TMP6]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = call @llvm.riscv.vadd.mask.nxv64i8.nxv64i8.i64( poison, [[TMP5]], [[CAST_SCALABLE]], [[TMP4]], i64 256, i64 3) +// CHECK-NEXT: [[CAST_FIXED:%.*]] = call <256 x i8> @llvm.vector.extract.v256i8.nxv64i8( [[TMP7]], i64 0) +// CHECK-NEXT: store <256 x i8> [[CAST_FIXED]], ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load <256 x i8>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[CAST_SCALABLE1:%.*]] = call @llvm.vector.insert.nxv64i8.v256i8( undef, <256 x i8> [[TMP8]], i64 0) +// CHECK-NEXT: ret [[CAST_SCALABLE1]] +// +fixed_int8m8_t test_bool1(vbool1_t m, vint8m8_t vec) { + vbool1_t mask = __riscv_vmand(m, global_bool1, __riscv_v_fixed_vlen); + return __riscv_vadd(mask, vec, 
global_vec_int8m8, __riscv_v_fixed_vlen); +} + +// CHECK-LABEL: @test_bool4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <64 x i16>, align 8 +// CHECK-NEXT: [[M_ADDR:%.*]] = alloca , align 1 +// CHECK-NEXT: [[VEC_ADDR:%.*]] = alloca , align 2 +// CHECK-NEXT: [[MASK:%.*]] = alloca , align 1 +// CHECK-NEXT: store [[M:%.*]], ptr [[M_ADDR]], align 1 +// CHECK-NEXT: store [[VEC:%.*]], ptr [[VEC_ADDR]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load , ptr [[M_ADDR]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @global_bool4, align 8 +// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = call @llvm.vector.insert.nxv2i8.v8i8( undef, <8 x i8> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast [[CAST_SCALABLE]] to +// CHECK-NEXT: [[TMP3:%.*]] = call @llvm.riscv.vmand.nxv16i1.i64( [[TMP0]], [[TMP2]], i64 64) +// CHECK-NEXT: store [[TMP3]], ptr [[MASK]], align 1 +// CHECK-NEXT: [[TMP4:%.*]] = load , ptr [[MASK]], align 1 +// CHECK-NEXT: [[TMP5:%.*]] = load , ptr [[VEC_ADDR]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = load <64 x i16>, ptr @global_vec_int16m4, align 8 +// CHECK-NEXT: [[CAST_SCALABLE1:%.*]] = call @llvm.vector.insert.nxv16i16.v64i16( undef, <64 x i16> [[TMP6]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = call @llvm.riscv.vadd.mask.nxv16i16.nxv16i16.i64( poison, [[TMP5]], [[CAST_SCALABLE1]], [[TMP4]], i64 64, i64 3) +// CHECK-NEXT: [[CAST_FIXED:%.*]] = call <64 x i16> @llvm.vector.extract.v64i16.nxv16i16( [[TMP7]], i64 0) +// CHECK-NEXT: store <64 x i16> [[CAST_FIXED]], ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load <64 x i16>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[CAST_SCALABLE2:%.*]] = call @llvm.vector.insert.nxv16i16.v64i16( undef, <64 x i16> [[TMP8]], i64 0) +// CHECK-NEXT: ret [[CAST_SCALABLE2]] +// +fixed_int16m4_t test_bool4(vbool4_t m, vint16m4_t vec) { + vbool4_t mask = __riscv_vmand(m, global_bool4, __riscv_v_fixed_vlen/4); + return __riscv_vadd(mask, vec, global_vec_int16m4, __riscv_v_fixed_vlen/4); +} + +// CHECK-LABEL: 
@test_bool32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <8 x i32>, align 8 +// CHECK-NEXT: [[M_ADDR:%.*]] = alloca , align 1 +// CHECK-NEXT: [[VEC_ADDR:%.*]] = alloca , align 4 +// CHECK-NEXT: [[MASK:%.*]] = alloca , align 1 +// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1 +// CHECK-NEXT: store [[M:%.*]], ptr [[M_ADDR]], align 1 +// CHECK-NEXT: store [[VEC:%.*]], ptr [[VEC_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load , ptr [[M_ADDR]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr @global_bool32, align 1 +// CHECK-NEXT: store <1 x i8> [[TMP1]], ptr [[SAVED_VALUE]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[SAVED_VALUE]], align 1 +// CHECK-NEXT: [[TMP3:%.*]] = call @llvm.riscv.vmand.nxv2i1.i64( [[TMP0]], [[TMP2]], i64 8) +// CHECK-NEXT: store [[TMP3]], ptr [[MASK]], align 1 +// CHECK-NEXT: [[TMP4:%.*]] = load , ptr [[MASK]], align 1 +// CHECK-NEXT: [[TMP5:%.*]] = load , ptr [[VEC_ADDR]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr @global_vec, align 8 +// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = call @llvm.vector.insert.nxv2i32.v8i32( undef, <8 x i32> [[TMP6]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = call @llvm.riscv.vadd.mask.nxv2i32.nxv2i32.i64( poison, [[TMP5]], [[CAST_SCALABLE]], [[TMP4]], i64 8, i64 3) +// CHECK-NEXT: [[CAST_FIXED:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32( [[TMP7]], i64 0) +// CHECK-NEXT: store <8 x i32> [[CAST_FIXED]], ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[CAST_SCALABLE1:%.*]] = call @llvm.vector.insert.nxv2i32.v8i32( undef, <8 x i32> [[TMP8]], i64 0) +// CHECK-NEXT: ret [[CAST_SCALABLE1]] +// +fixed_int32m1_t test_bool32(vbool32_t m, vint32m1_t vec) { + vbool32_t mask = __riscv_vmand(m, global_bool32, __riscv_v_fixed_vlen/32); + return __riscv_vadd(mask, vec, global_vec, __riscv_v_fixed_vlen/32); +} // CHECK-LABEL: @test_ptr_to_global( // CHECK-NEXT: entry: @@ -70,6 +176,72 @@ 
fixed_int32m1_t array_arg(fixed_int32m1_t arr[]) { return arr[0]; } +// CHECK-LABEL: @address_of_array_idx_bool1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <32 x i8>, align 8 +// CHECK-NEXT: [[ARR:%.*]] = alloca [3 x <32 x i8>], align 8 +// CHECK-NEXT: [[PARR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <32 x i8>], ptr [[ARR]], i64 0, i64 0 +// CHECK-NEXT: store ptr [[ARRAYIDX]], ptr [[PARR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PARR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 8 +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[RETVAL]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL_COERCE]], ptr align 8 [[RETVAL]], i64 32, i1 false) +// CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 8 +// CHECK-NEXT: ret [[TMP2]] +// +fixed_bool1_t address_of_array_idx_bool1() { + fixed_bool1_t arr[3]; + fixed_bool1_t *parr; + parr = &arr[0]; + return *parr; +} + +// CHECK-LABEL: @address_of_array_idx_bool4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[ARR:%.*]] = alloca [3 x <8 x i8>], align 8 +// CHECK-NEXT: [[PARR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[ARR]], i64 0, i64 0 +// CHECK-NEXT: store ptr [[ARRAYIDX]], ptr [[PARR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PARR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[TMP0]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP1]], ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = call @llvm.vector.insert.nxv2i8.v8i8( undef, <8 x i8> [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[CAST_SCALABLE]] to +// CHECK-NEXT: ret [[TMP3]] +// +fixed_bool4_t address_of_array_idx_bool4() { 
+ fixed_bool4_t arr[3]; + fixed_bool4_t *parr; + parr = &arr[0]; + return *parr; +} + +// CHECK-LABEL: @address_of_array_idx_bool32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <1 x i8>, align 1 +// CHECK-NEXT: [[ARR:%.*]] = alloca [3 x <1 x i8>], align 1 +// CHECK-NEXT: [[PARR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i8>], ptr [[ARR]], i64 0, i64 0 +// CHECK-NEXT: store ptr [[ARRAYIDX]], ptr [[PARR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PARR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[TMP0]], align 1 +// CHECK-NEXT: store <1 x i8> [[TMP1]], ptr [[RETVAL]], align 1 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[RETVAL_COERCE]], ptr align 1 [[RETVAL]], i64 1, i1 false) +// CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 +// CHECK-NEXT: ret [[TMP2]] +// +fixed_bool32_t address_of_array_idx_bool32() { + fixed_bool32_t arr[3]; + fixed_bool32_t *parr; + parr = &arr[0]; + return *parr; +} + // CHECK-LABEL: @test_cast( // CHECK-NEXT: entry: // CHECK-NEXT: [[RETVAL:%.*]] = alloca <8 x i32>, align 8 diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c index 8bdcd9af20efca..d7df1a24bbfb00 100644 --- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c +++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c @@ -17,10 +17,25 @@ typedef __rvv_uint64m1_t vuint64m1_t; typedef __rvv_float32m1_t vfloat32m1_t; typedef __rvv_float64m1_t vfloat64m1_t; +typedef __rvv_bool1_t vbool1_t; +typedef __rvv_bool4_t vbool4_t; +typedef __rvv_bool32_t vbool32_t; + typedef vint64m1_t fixed_int64m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vbool1_t fixed_bool1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vbool4_t fixed_bool4_t 
__attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/4))); +#if __riscv_v_fixed_vlen >= 256 +typedef vbool32_t fixed_bool32_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/32))); +#endif fixed_int64m1_t global_i64; +fixed_bool1_t global_bool1; +fixed_bool4_t global_bool4; +#if __riscv_v_fixed_vlen >= 256 +fixed_bool32_t global_bool32; +#endif + //===----------------------------------------------------------------------===// // WRITES //===----------------------------------------------------------------------===// @@ -39,6 +54,52 @@ fixed_int64m1_t global_i64; // void write_global_i64(vint64m1_t v) { global_i64 = v; } +// CHECK-64-LABEL: @write_global_bool1( +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 8 +// CHECK-64-NEXT: store [[V:%.*]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA7:![0-9]+]] +// CHECK-64-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]] +// CHECK-64-NEXT: store <8 x i8> [[TMP0]], ptr @global_bool1, align 8, !tbaa [[TBAA4]] +// CHECK-64-NEXT: ret void +// +// CHECK-256-LABEL: @write_global_bool1( +// CHECK-256-NEXT: entry: +// CHECK-256-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 8 +// CHECK-256-NEXT: store [[V:%.*]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA7:![0-9]+]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]] +// CHECK-256-NEXT: store <32 x i8> [[TMP0]], ptr @global_bool1, align 8, !tbaa [[TBAA4]] +// CHECK-256-NEXT: ret void +// +void write_global_bool1(vbool1_t v) { global_bool1 = v; } + +// CHECK-64-LABEL: @write_global_bool4( +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[TMP0:%.*]] = bitcast [[V:%.*]] to +// CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x i8> @llvm.vector.extract.v2i8.nxv2i8( [[TMP0]], i64 0) +// CHECK-64-NEXT: store <2 x i8> [[CAST_FIXED]], ptr @global_bool4, align 2, !tbaa [[TBAA4]] +// CHECK-64-NEXT: ret void +// +// CHECK-256-LABEL: @write_global_bool4( +// CHECK-256-NEXT: entry: 
+// CHECK-256-NEXT: [[TMP0:%.*]] = bitcast [[V:%.*]] to +// CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i8> @llvm.vector.extract.v8i8.nxv2i8( [[TMP0]], i64 0) +// CHECK-256-NEXT: store <8 x i8> [[CAST_FIXED]], ptr @global_bool4, align 8, !tbaa [[TBAA4]] +// CHECK-256-NEXT: ret void +// +void write_global_bool4(vbool4_t v) { global_bool4 = v; } + +#if __riscv_v_fixed_vlen >= 256 +// CHECK-256-LABEL: @write_global_bool32( +// CHECK-256-NEXT: entry: +// CHECK-256-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 1 +// CHECK-256-NEXT: store [[V:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA9:![0-9]+]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA4]] +// CHECK-256-NEXT: store <1 x i8> [[TMP0]], ptr @global_bool32, align 1, !tbaa [[TBAA4]] +// CHECK-256-NEXT: ret void +// +void write_global_bool32(vbool32_t v) { global_bool32 = v; } +#endif + //===----------------------------------------------------------------------===// // READS //===----------------------------------------------------------------------===// @@ -56,3 +117,49 @@ void write_global_i64(vint64m1_t v) { global_i64 = v; } // CHECK-256-NEXT: ret [[CAST_SCALABLE]] // vint64m1_t read_global_i64() { return global_i64; } + +// CHECK-64-LABEL: @read_global_bool1( +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[SAVED_VALUE:%.*]] = alloca <8 x i8>, align 8 +// CHECK-64-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @global_bool1, align 8, !tbaa [[TBAA4]] +// CHECK-64-NEXT: store <8 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]] +// CHECK-64-NEXT: [[TMP1:%.*]] = load , ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]] +// CHECK-64-NEXT: ret [[TMP1]] +// +// CHECK-256-LABEL: @read_global_bool1( +// CHECK-256-NEXT: entry: +// CHECK-256-NEXT: [[SAVED_VALUE:%.*]] = alloca <32 x i8>, align 32 +// CHECK-256-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr @global_bool1, align 8, !tbaa [[TBAA4]] +// CHECK-256-NEXT: store <32 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 32, !tbaa 
[[TBAA4]] +// CHECK-256-NEXT: [[TMP1:%.*]] = load , ptr [[SAVED_VALUE]], align 32, !tbaa [[TBAA4]] +// CHECK-256-NEXT: ret [[TMP1]] +// +vbool1_t read_global_bool1() { return global_bool1; } + +// CHECK-64-LABEL: @read_global_bool4( +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr @global_bool4, align 2, !tbaa [[TBAA4]] +// CHECK-64-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i8.v2i8( undef, <2 x i8> [[TMP0]], i64 0) +// CHECK-64-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to +// CHECK-64-NEXT: ret [[TMP1]] +// +// CHECK-256-LABEL: @read_global_bool4( +// CHECK-256-NEXT: entry: +// CHECK-256-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @global_bool4, align 8, !tbaa [[TBAA4]] +// CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i8.v8i8( undef, <8 x i8> [[TMP0]], i64 0) +// CHECK-256-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to +// CHECK-256-NEXT: ret [[TMP1]] +// +vbool4_t read_global_bool4() { return global_bool4; } + +#if __riscv_v_fixed_vlen >= 256 +// CHECK-256-LABEL: @read_global_bool32( +// CHECK-256-NEXT: entry: +// CHECK-256-NEXT: [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1 +// CHECK-256-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr @global_bool32, align 1, !tbaa [[TBAA4]] +// CHECK-256-NEXT: store <1 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA4]] +// CHECK-256-NEXT: [[TMP1:%.*]] = load , ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA4]] +// CHECK-256-NEXT: ret [[TMP1]] +// +vbool32_t read_global_bool32() { return global_bool32; } +#endif diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-types.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-types.c index 85a320ba50d243..027f7ab24aa120 100644 --- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-types.c +++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-types.c @@ -8,6 +8,14 @@ #include +typedef __rvv_bool64_t vbool64_t; +typedef __rvv_bool32_t vbool32_t; +typedef __rvv_bool16_t vbool16_t; +typedef __rvv_bool8_t 
vbool8_t; +typedef __rvv_bool4_t vbool4_t; +typedef __rvv_bool2_t vbool2_t; +typedef __rvv_bool1_t vbool1_t; + typedef __rvv_int8mf8_t vint8mf8_t; typedef __rvv_uint8mf8_t vuint8mf8_t; @@ -141,6 +149,20 @@ typedef vuint64m8_t fixed_uint64m8_t __attribute__((riscv_rvv_vector_bits(__risc typedef vfloat32m8_t fixed_float32m8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * 8))); typedef vfloat64m8_t fixed_float64m8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * 8))); +#if __riscv_v_fixed_vlen / 64 >= 8 +typedef vbool64_t fixed_bool64_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 64))); +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 +typedef vbool32_t fixed_bool32_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 32))); +#endif +#if __riscv_v_fixed_vlen / 16 >= 8 +typedef vbool16_t fixed_bool16_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 16))); +#endif +typedef vbool8_t fixed_bool8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 8))); +typedef vbool4_t fixed_bool4_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 4))); +typedef vbool2_t fixed_bool2_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 2))); +typedef vbool1_t fixed_bool1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); + //===----------------------------------------------------------------------===// // Structs and unions //===----------------------------------------------------------------------===// @@ -198,6 +220,20 @@ DEFINE_STRUCT(uint64m8) DEFINE_STRUCT(float32m8) DEFINE_STRUCT(float64m8) +DEFINE_STRUCT(bool1) +DEFINE_STRUCT(bool2) +DEFINE_STRUCT(bool4) +DEFINE_STRUCT(bool8) +#if __riscv_v_fixed_vlen / 16 >= 8 +DEFINE_STRUCT(bool16) +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 +DEFINE_STRUCT(bool32) +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 +DEFINE_STRUCT(bool64) +#endif + DEFINE_UNION(int8m1) DEFINE_UNION(int16m1) DEFINE_UNION(int32m1) @@ -242,6 +278,20 @@ 
DEFINE_UNION(uint64m8) DEFINE_UNION(float32m8) DEFINE_UNION(float64m8) +DEFINE_UNION(bool1) +DEFINE_UNION(bool2) +DEFINE_UNION(bool4) +DEFINE_UNION(bool8) +#if __riscv_v_fixed_vlen / 16 >= 8 +DEFINE_UNION(bool16) +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 +DEFINE_UNION(bool32) +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 +DEFINE_UNION(bool64) +#endif + //===----------------------------------------------------------------------===// // Global variables //===----------------------------------------------------------------------===// @@ -297,6 +347,20 @@ fixed_uint64m8_t global_u64m8; fixed_float32m8_t global_f32m8; fixed_float64m8_t global_f64m8; +fixed_bool1_t global_bool1; +fixed_bool2_t global_bool2; +fixed_bool4_t global_bool4; +fixed_bool8_t global_bool8; +#if __riscv_v_fixed_vlen / 16 >= 8 +fixed_bool16_t global_bool16; +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 +fixed_bool32_t global_bool32; +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 +fixed_bool64_t global_bool64; +#endif + //===----------------------------------------------------------------------===// // Global arrays //===----------------------------------------------------------------------===// @@ -352,6 +416,20 @@ fixed_uint64m8_t global_arr_u64m8[3]; fixed_float32m8_t global_arr_f32m8[3]; fixed_float64m8_t global_arr_f64m8[3]; +fixed_bool1_t global_arr_bool1[3]; +fixed_bool2_t global_arr_bool2[3]; +fixed_bool4_t global_arr_bool4[3]; +fixed_bool8_t global_arr_bool8[3]; +#if __riscv_v_fixed_vlen / 16 >= 8 +fixed_bool16_t global_arr_bool16[3]; +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 +fixed_bool32_t global_arr_bool32[3]; +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 +fixed_bool64_t global_arr_bool64[3]; +#endif + //===----------------------------------------------------------------------===// // Locals //===----------------------------------------------------------------------===// @@ -401,6 +479,20 @@ void f() { fixed_float32m8_t local_f32m8; fixed_float64m8_t local_f64m8; + fixed_bool1_t local_bool1; 
+ fixed_bool2_t local_bool2; + fixed_bool4_t local_bool4; + fixed_bool8_t local_bool8; +#if __riscv_v_fixed_vlen / 16 >= 8 + fixed_bool16_t local_bool16; +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 + fixed_bool32_t local_bool32; +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 + fixed_bool64_t local_bool64; +#endif + // Arrays fixed_int8m1_t local_arr_i8[3]; fixed_int16m1_t local_arr_i16[3]; @@ -461,6 +553,20 @@ void f() { fixed_int8mf8_t local_arr_i8mf8[3]; fixed_uint8mf8_t local_arr_u8mf8[3]; + + fixed_bool1_t local_arr_bool1[3]; + fixed_bool2_t local_arr_bool2[3]; + fixed_bool4_t local_arr_bool4[3]; + fixed_bool8_t local_arr_bool8[3]; +#if __riscv_v_fixed_vlen / 16 >= 8 + fixed_bool16_t local_arr_bool16[3]; +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 + fixed_bool32_t local_arr_bool32[3]; +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 + fixed_bool64_t local_arr_bool64[3]; +#endif } //===----------------------------------------------------------------------===// @@ -506,6 +612,10 @@ void f() { // CHECK-64-NEXT: %struct.struct_uint64m8 = type { <8 x i64> } // CHECK-64-NEXT: %struct.struct_float32m8 = type { <16 x float> } // CHECK-64-NEXT: %struct.struct_float64m8 = type { <8 x double> } +// CHECK-64-NEXT: %struct.struct_bool1 = type { <8 x i8> } +// CHECK-64-NEXT: %struct.struct_bool2 = type { <4 x i8> } +// CHECK-64-NEXT: %struct.struct_bool4 = type { <2 x i8> } +// CHECK-64-NEXT: %struct.struct_bool8 = type { <1 x i8> } // CHECK-128: %struct.struct_int8m1 = type { <16 x i8> } // CHECK-128-NEXT: %struct.struct_int16m1 = type { <8 x i16> } @@ -547,6 +657,11 @@ void f() { // CHECK-128-NEXT: %struct.struct_uint64m8 = type { <16 x i64> } // CHECK-128-NEXT: %struct.struct_float32m8 = type { <32 x float> } // CHECK-128-NEXT: %struct.struct_float64m8 = type { <16 x double> } +// CHECK-128-NEXT: %struct.struct_bool1 = type { <16 x i8> } +// CHECK-128-NEXT: %struct.struct_bool2 = type { <8 x i8> } +// CHECK-128-NEXT: %struct.struct_bool4 = type { <4 x i8> } +// CHECK-128-NEXT: 
%struct.struct_bool8 = type { <2 x i8> } +// CHECK-128-NEXT: %struct.struct_bool16 = type { <1 x i8> } // CHECK-256: %struct.struct_int8m1 = type { <32 x i8> } // CHECK-256-NEXT: %struct.struct_int16m1 = type { <16 x i16> } @@ -587,6 +702,13 @@ void f() { // CHECK-256-NEXT: %struct.struct_uint32m8 = type { <64 x i32> } // CHECK-256-NEXT: %struct.struct_uint64m8 = type { <32 x i64> } // CHECK-256-NEXT: %struct.struct_float32m8 = type { <64 x float> } +// CHECK-256-NEXT: %struct.struct_float64m8 = type { <32 x double> } +// CHECK-256-NEXT: %struct.struct_bool1 = type { <32 x i8> } +// CHECK-256-NEXT: %struct.struct_bool2 = type { <16 x i8> } +// CHECK-256-NEXT: %struct.struct_bool4 = type { <8 x i8> } +// CHECK-256-NEXT: %struct.struct_bool8 = type { <4 x i8> } +// CHECK-256-NEXT: %struct.struct_bool16 = type { <2 x i8> } +// CHECK-256-NEXT: %struct.struct_bool32 = type { <1 x i8> } // CHECK-512: %struct.struct_int8m1 = type { <64 x i8> } // CHECK-512-NEXT: %struct.struct_int16m1 = type { <32 x i16> } @@ -627,6 +749,14 @@ void f() { // CHECK-512-NEXT: %struct.struct_uint32m8 = type { <128 x i32> } // CHECK-512-NEXT: %struct.struct_uint64m8 = type { <64 x i64> } // CHECK-512-NEXT: %struct.struct_float32m8 = type { <128 x float> } +// CHECK-512-NEXT: %struct.struct_float64m8 = type { <64 x double> } +// CHECK-512-NEXT: %struct.struct_bool1 = type { <64 x i8> } +// CHECK-512-NEXT: %struct.struct_bool2 = type { <32 x i8> } +// CHECK-512-NEXT: %struct.struct_bool4 = type { <16 x i8> } +// CHECK-512-NEXT: %struct.struct_bool8 = type { <8 x i8> } +// CHECK-512-NEXT: %struct.struct_bool16 = type { <4 x i8> } +// CHECK-512-NEXT: %struct.struct_bool32 = type { <2 x i8> } +// CHECK-512-NEXT: %struct.struct_bool64 = type { <1 x i8> } // CHECK-1024: %struct.struct_int8m1 = type { <128 x i8> } // CHECK-1024-NEXT: %struct.struct_int16m1 = type { <64 x i16> } @@ -667,6 +797,14 @@ void f() { // CHECK-1024-NEXT: %struct.struct_uint32m8 = type { <256 x i32> } // CHECK-1024-NEXT: 
%struct.struct_uint64m8 = type { <128 x i64> } // CHECK-1024-NEXT: %struct.struct_float32m8 = type { <256 x float> } +// CHECK-1024-NEXT: %struct.struct_float64m8 = type { <128 x double> } +// CHECK-1024-NEXT: %struct.struct_bool1 = type { <128 x i8> } +// CHECK-1024-NEXT: %struct.struct_bool2 = type { <64 x i8> } +// CHECK-1024-NEXT: %struct.struct_bool4 = type { <32 x i8> } +// CHECK-1024-NEXT: %struct.struct_bool8 = type { <16 x i8> } +// CHECK-1024-NEXT: %struct.struct_bool16 = type { <8 x i8> } +// CHECK-1024-NEXT: %struct.struct_bool32 = type { <4 x i8> } +// CHECK-1024-NEXT: %struct.struct_bool64 = type { <2 x i8> } // CHECK-64: %union.union_int8m1 = type { <8 x i8> } // CHECK-64-NEXT: %union.union_int16m1 = type { <4 x i16> } @@ -708,6 +846,10 @@ void f() { // CHECK-64-NEXT: %union.union_uint64m8 = type { <8 x i64> } // CHECK-64-NEXT: %union.union_float32m8 = type { <16 x float> } // CHECK-64-NEXT: %union.union_float64m8 = type { <8 x double> } +// CHECK-64-NEXT: %union.union_bool1 = type { <8 x i8> } +// CHECK-64-NEXT: %union.union_bool2 = type { <4 x i8> } +// CHECK-64-NEXT: %union.union_bool4 = type { <2 x i8> } +// CHECK-64-NEXT: %union.union_bool8 = type { <1 x i8> } // CHECK-128: %union.union_int8m1 = type { <16 x i8> } // CHECK-128-NEXT: %union.union_int16m1 = type { <8 x i16> } @@ -749,6 +891,11 @@ void f() { // CHECK-128-NEXT: %union.union_uint64m8 = type { <16 x i64> } // CHECK-128-NEXT: %union.union_float32m8 = type { <32 x float> } // CHECK-128-NEXT: %union.union_float64m8 = type { <16 x double> } +// CHECK-128-NEXT: %union.union_bool1 = type { <16 x i8> } +// CHECK-128-NEXT: %union.union_bool2 = type { <8 x i8> } +// CHECK-128-NEXT: %union.union_bool4 = type { <4 x i8> } +// CHECK-128-NEXT: %union.union_bool8 = type { <2 x i8> } +// CHECK-128-NEXT: %union.union_bool16 = type { <1 x i8> } // CHECK-256: %union.union_int8m1 = type { <32 x i8> } // CHECK-256-NEXT: %union.union_int16m1 = type { <16 x i16> } @@ -790,6 +937,12 @@ void f() { // 
CHECK-256-NEXT: %union.union_uint64m8 = type { <32 x i64> } // CHECK-256-NEXT: %union.union_float32m8 = type { <64 x float> } // CHECK-256-NEXT: %union.union_float64m8 = type { <32 x double> } +// CHECK-256-NEXT: %union.union_bool1 = type { <32 x i8> } +// CHECK-256-NEXT: %union.union_bool2 = type { <16 x i8> } +// CHECK-256-NEXT: %union.union_bool4 = type { <8 x i8> } +// CHECK-256-NEXT: %union.union_bool8 = type { <4 x i8> } +// CHECK-256-NEXT: %union.union_bool16 = type { <2 x i8> } +// CHECK-256-NEXT: %union.union_bool32 = type { <1 x i8> } // CHECK-512: %union.union_int8m1 = type { <64 x i8> } // CHECK-512-NEXT: %union.union_int16m1 = type { <32 x i16> } @@ -831,6 +984,13 @@ void f() { // CHECK-512-NEXT: %union.union_uint64m8 = type { <64 x i64> } // CHECK-512-NEXT: %union.union_float32m8 = type { <128 x float> } // CHECK-512-NEXT: %union.union_float64m8 = type { <64 x double> } +// CHECK-512-NEXT: %union.union_bool1 = type { <64 x i8> } +// CHECK-512-NEXT: %union.union_bool2 = type { <32 x i8> } +// CHECK-512-NEXT: %union.union_bool4 = type { <16 x i8> } +// CHECK-512-NEXT: %union.union_bool8 = type { <8 x i8> } +// CHECK-512-NEXT: %union.union_bool16 = type { <4 x i8> } +// CHECK-512-NEXT: %union.union_bool32 = type { <2 x i8> } +// CHECK-512-NEXT: %union.union_bool64 = type { <1 x i8> } // CHECK-1024: %union.union_int8m1 = type { <128 x i8> } // CHECK-1024-NEXT: %union.union_int16m1 = type { <64 x i16> } @@ -872,6 +1032,13 @@ void f() { // CHECK-1024-NEXT: %union.union_uint64m8 = type { <128 x i64> } // CHECK-1024-NEXT: %union.union_float32m8 = type { <256 x float> } // CHECK-1024-NEXT: %union.union_float64m8 = type { <128 x double> } +// CHECK-1024-NEXT: %union.union_bool1 = type { <128 x i8> } +// CHECK-1024-NEXT: %union.union_bool2 = type { <64 x i8> } +// CHECK-1024-NEXT: %union.union_bool4 = type { <32 x i8> } +// CHECK-1024-NEXT: %union.union_bool8 = type { <16 x i8> } +// CHECK-1024-NEXT: %union.union_bool16 = type { <8 x i8> } +// CHECK-1024-NEXT: 
%union.union_bool32 = type { <4 x i8> } +// CHECK-1024-NEXT: %union.union_bool64 = type { <2 x i8> } //===----------------------------------------------------------------------===// // Global variables @@ -916,6 +1083,10 @@ void f() { // CHECK-64-NEXT: @global_u64m8 ={{.*}} global <8 x i64> zeroinitializer, align 8 // CHECK-64-NEXT: @global_f32m8 ={{.*}} global <16 x float> zeroinitializer, align 8 // CHECK-64-NEXT: @global_f64m8 ={{.*}} global <8 x double> zeroinitializer, align 8 +// CHECK-64-NEXT: @global_bool1 ={{.*}} global <8 x i8> zeroinitializer, align 8 +// CHECK-64-NEXT: @global_bool2 ={{.*}} global <4 x i8> zeroinitializer, align 4 +// CHECK-64-NEXT: @global_bool4 ={{.*}} global <2 x i8> zeroinitializer, align 2 +// CHECK-64-NEXT: @global_bool8 ={{.*}} global <1 x i8> zeroinitializer, align 1 // CHECK-128: @global_i8 ={{.*}} global <16 x i8> zeroinitializer, align 8 // CHECK-128-NEXT: @global_i16 ={{.*}} global <8 x i16> zeroinitializer, align 8 @@ -957,6 +1128,11 @@ void f() { // CHECK-128-NEXT: @global_u64m8 ={{.*}} global <16 x i64> zeroinitializer, align 8 // CHECK-128-NEXT: @global_f32m8 ={{.*}} global <32 x float> zeroinitializer, align 8 // CHECK-128-NEXT: @global_f64m8 ={{.*}} global <16 x double> zeroinitializer, align 8 +// CHECK-128-NEXT: @global_bool1 ={{.*}} global <16 x i8> zeroinitializer, align 8 +// CHECK-128-NEXT: @global_bool2 ={{.*}} global <8 x i8> zeroinitializer, align 8 +// CHECK-128-NEXT: @global_bool4 ={{.*}} global <4 x i8> zeroinitializer, align 4 +// CHECK-128-NEXT: @global_bool8 ={{.*}} global <2 x i8> zeroinitializer, align 2 +// CHECK-128-NEXT: @global_bool16 ={{.*}} global <1 x i8> zeroinitializer, align 1 // CHECK-256: @global_i8 ={{.*}} global <32 x i8> zeroinitializer, align 8 // CHECK-256-NEXT: @global_i16 ={{.*}} global <16 x i16> zeroinitializer, align 8 @@ -998,6 +1174,12 @@ void f() { // CHECK-256-NEXT: @global_u64m8 ={{.*}} global <32 x i64> zeroinitializer, align 8 // CHECK-256-NEXT: @global_f32m8 ={{.*}} global 
<64 x float> zeroinitializer, align 8 // CHECK-256-NEXT: @global_f64m8 ={{.*}} global <32 x double> zeroinitializer, align 8 +// CHECK-256-NEXT: @global_bool1 ={{.*}} global <32 x i8> zeroinitializer, align 8 +// CHECK-256-NEXT: @global_bool2 ={{.*}} global <16 x i8> zeroinitializer, align 8 +// CHECK-256-NEXT: @global_bool4 ={{.*}} global <8 x i8> zeroinitializer, align 8 +// CHECK-256-NEXT: @global_bool8 ={{.*}} global <4 x i8> zeroinitializer, align 4 +// CHECK-256-NEXT: @global_bool16 ={{.*}} global <2 x i8> zeroinitializer, align 2 +// CHECK-256-NEXT: @global_bool32 ={{.*}} global <1 x i8> zeroinitializer, align 1 // CHECK-512: @global_i8 ={{.*}} global <64 x i8> zeroinitializer, align 8 // CHECK-512-NEXT: @global_i16 ={{.*}} global <32 x i16> zeroinitializer, align 8 @@ -1039,6 +1221,13 @@ void f() { // CHECK-512-NEXT: @global_u64m8 ={{.*}} global <64 x i64> zeroinitializer, align 8 // CHECK-512-NEXT: @global_f32m8 ={{.*}} global <128 x float> zeroinitializer, align 8 // CHECK-512-NEXT: @global_f64m8 ={{.*}} global <64 x double> zeroinitializer, align 8 +// CHECK-512-NEXT: @global_bool1 ={{.*}} global <64 x i8> zeroinitializer, align 8 +// CHECK-512-NEXT: @global_bool2 ={{.*}} global <32 x i8> zeroinitializer, align 8 +// CHECK-512-NEXT: @global_bool4 ={{.*}} global <16 x i8> zeroinitializer, align 8 +// CHECK-512-NEXT: @global_bool8 ={{.*}} global <8 x i8> zeroinitializer, align 8 +// CHECK-512-NEXT: @global_bool16 ={{.*}} global <4 x i8> zeroinitializer, align 4 +// CHECK-512-NEXT: @global_bool32 ={{.*}} global <2 x i8> zeroinitializer, align 2 +// CHECK-512-NEXT: @global_bool64 ={{.*}} global <1 x i8> zeroinitializer, align 1 // CHECK-1024: @global_i8 ={{.*}} global <128 x i8> zeroinitializer, align 8 // CHECK-1024-NEXT: @global_i16 ={{.*}} global <64 x i16> zeroinitializer, align 8 @@ -1080,6 +1269,13 @@ void f() { // CHECK-1024-NEXT: @global_u64m8 ={{.*}} global <128 x i64> zeroinitializer, align 8 // CHECK-1024-NEXT: @global_f32m8 ={{.*}} global <256 x 
float> zeroinitializer, align 8 // CHECK-1024-NEXT: @global_f64m8 ={{.*}} global <128 x double> zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_bool1 ={{.*}} global <128 x i8> zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_bool2 ={{.*}} global <64 x i8> zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_bool4 ={{.*}} global <32 x i8> zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_bool8 ={{.*}} global <16 x i8> zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_bool16 ={{.*}} global <8 x i8> zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_bool32 ={{.*}} global <4 x i8> zeroinitializer, align 4 +// CHECK-1024-NEXT: @global_bool64 ={{.*}} global <2 x i8> zeroinitializer, align 2 //===----------------------------------------------------------------------===// // Global arrays @@ -1124,6 +1320,10 @@ void f() { // CHECK-64-NEXT: @global_arr_u64m8 ={{.*}} global [3 x <8 x i64>] zeroinitializer, align 8 // CHECK-64-NEXT: @global_arr_f32m8 ={{.*}} global [3 x <16 x float>] zeroinitializer, align 8 // CHECK-64-NEXT: @global_arr_f64m8 ={{.*}} global [3 x <8 x double>] zeroinitializer, align 8 +// CHECK-64-NEXT: @global_arr_bool1 ={{.*}} global [3 x <8 x i8>] zeroinitializer, align 8 +// CHECK-64-NEXT: @global_arr_bool2 ={{.*}} global [3 x <4 x i8>] zeroinitializer, align 4 +// CHECK-64-NEXT: @global_arr_bool4 ={{.*}} global [3 x <2 x i8>] zeroinitializer, align 2 +// CHECK-64-NEXT: @global_arr_bool8 ={{.*}} global [3 x <1 x i8>] zeroinitializer, align 1 // CHECK-128: @global_arr_i8 ={{.*}} global [3 x <16 x i8>] zeroinitializer, align 8 // CHECK-128-NEXT: @global_arr_i16 ={{.*}} global [3 x <8 x i16>] zeroinitializer, align 8 @@ -1165,6 +1365,11 @@ void f() { // CHECK-128-NEXT: @global_arr_u64m8 ={{.*}} global [3 x <16 x i64>] zeroinitializer, align 8 // CHECK-128-NEXT: @global_arr_f32m8 ={{.*}} global [3 x <32 x float>] zeroinitializer, align 8 // CHECK-128-NEXT: @global_arr_f64m8 ={{.*}} global [3 x <16 x double>] zeroinitializer, align 8 +// 
CHECK-128-NEXT: @global_arr_bool1 ={{.*}} global [3 x <16 x i8>] zeroinitializer, align 8 +// CHECK-128-NEXT: @global_arr_bool2 ={{.*}} global [3 x <8 x i8>] zeroinitializer, align 8 +// CHECK-128-NEXT: @global_arr_bool4 ={{.*}} global [3 x <4 x i8>] zeroinitializer, align 4 +// CHECK-128-NEXT: @global_arr_bool8 ={{.*}} global [3 x <2 x i8>] zeroinitializer, align 2 +// CHECK-128-NEXT: @global_arr_bool16 ={{.*}} global [3 x <1 x i8>] zeroinitializer, align 1 // CHECK-256: @global_arr_i8 ={{.*}} global [3 x <32 x i8>] zeroinitializer, align 8 // CHECK-256-NEXT: @global_arr_i16 ={{.*}} global [3 x <16 x i16>] zeroinitializer, align 8 @@ -1206,6 +1411,13 @@ void f() { // CHECK-256-NEXT: @global_arr_u64m8 ={{.*}} global [3 x <32 x i64>] zeroinitializer, align 8 // CHECK-256-NEXT: @global_arr_f32m8 ={{.*}} global [3 x <64 x float>] zeroinitializer, align 8 // CHECK-256-NEXT: @global_arr_f64m8 ={{.*}} global [3 x <32 x double>] zeroinitializer, align 8 +// CHECK-256-NEXT: @global_arr_bool1 ={{.*}} global [3 x <32 x i8>] zeroinitializer, align 8 +// CHECK-256-NEXT: @global_arr_bool2 ={{.*}} global [3 x <16 x i8>] zeroinitializer, align 8 +// CHECK-256-NEXT: @global_arr_bool4 ={{.*}} global [3 x <8 x i8>] zeroinitializer, align 8 +// CHECK-256-NEXT: @global_arr_bool8 ={{.*}} global [3 x <4 x i8>] zeroinitializer, align 4 +// CHECK-256-NEXT: @global_arr_bool16 ={{.*}} global [3 x <2 x i8>] zeroinitializer, align 2 +// CHECK-256-NEXT: @global_arr_bool32 ={{.*}} global [3 x <1 x i8>] zeroinitializer, align 1 + // CHECK-512: @global_arr_i8 ={{.*}} global [3 x <64 x i8>] zeroinitializer, align 8 // CHECK-512-NEXT: @global_arr_i16 ={{.*}} global [3 x <32 x i16>] zeroinitializer, align 8 // CHECK-512-NEXT: @global_arr_i32 ={{.*}} global [3 x <16 x i32>] zeroinitializer, align 8 @@ -1246,6 +1458,13 @@ void f() { // CHECK-512-NEXT: @global_arr_u64m8 ={{.*}} global [3 x <64 x i64>] zeroinitializer, align 8 // CHECK-512-NEXT: @global_arr_f32m8 ={{.*}} global [3 x <128 x float>] 
zeroinitializer, align 8 // CHECK-512-NEXT: @global_arr_f64m8 ={{.*}} global [3 x <64 x double>] zeroinitializer, align 8 +// CHECK-512-NEXT: @global_arr_bool1 ={{.*}} global [3 x <64 x i8>] zeroinitializer, align 8 +// CHECK-512-NEXT: @global_arr_bool2 ={{.*}} global [3 x <32 x i8>] zeroinitializer, align 8 +// CHECK-512-NEXT: @global_arr_bool4 ={{.*}} global [3 x <16 x i8>] zeroinitializer, align 8 +// CHECK-512-NEXT: @global_arr_bool8 ={{.*}} global [3 x <8 x i8>] zeroinitializer, align 8 +// CHECK-512-NEXT: @global_arr_bool16 ={{.*}} global [3 x <4 x i8>] zeroinitializer, align 4 +// CHECK-512-NEXT: @global_arr_bool32 ={{.*}} global [3 x <2 x i8>] zeroinitializer, align 2 +// CHECK-512-NEXT: @global_arr_bool64 ={{.*}} global [3 x <1 x i8>] zeroinitializer, align 1 // CHECK-1024: @global_arr_i8 ={{.*}} global [3 x <128 x i8>] zeroinitializer, align 8 // CHECK-1024-NEXT: @global_arr_i16 ={{.*}} global [3 x <64 x i16>] zeroinitializer, align 8 @@ -1287,6 +1506,13 @@ void f() { // CHECK-1024-NEXT: @global_arr_u64m8 ={{.*}} global [3 x <128 x i64>] zeroinitializer, align 8 // CHECK-1024-NEXT: @global_arr_f32m8 ={{.*}} global [3 x <256 x float>] zeroinitializer, align 8 // CHECK-1024-NEXT: @global_arr_f64m8 ={{.*}} global [3 x <128 x double>] zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_arr_bool1 ={{.*}} global [3 x <128 x i8>] zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_arr_bool2 ={{.*}} global [3 x <64 x i8>] zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_arr_bool4 ={{.*}} global [3 x <32 x i8>] zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_arr_bool8 ={{.*}} global [3 x <16 x i8>] zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_arr_bool16 ={{.*}} global [3 x <8 x i8>] zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_arr_bool32 ={{.*}} global [3 x <4 x i8>] zeroinitializer, align 4 +// CHECK-1024-NEXT: @global_arr_bool64 ={{.*}} global [3 x <2 x i8>] zeroinitializer, align 2 
//===----------------------------------------------------------------------===// // Local variables @@ -1331,6 +1557,10 @@ void f() { // CHECK-64-NEXT: %local_u64m8 = alloca <8 x i64>, align 8 // CHECK-64-NEXT: %local_f32m8 = alloca <16 x float>, align 8 // CHECK-64-NEXT: %local_f64m8 = alloca <8 x double>, align 8 +// CHECK-64-NEXT: %local_bool1 = alloca <8 x i8>, align 8 +// CHECK-64-NEXT: %local_bool2 = alloca <4 x i8>, align 4 +// CHECK-64-NEXT: %local_bool4 = alloca <2 x i8>, align 2 +// CHECK-64-NEXT: %local_bool8 = alloca <1 x i8>, align 1 // CHECK-128: %local_i8 = alloca <16 x i8>, align 8 // CHECK-128-NEXT: %local_i16 = alloca <8 x i16>, align 8 @@ -1372,6 +1602,11 @@ void f() { // CHECK-128-NEXT: %local_u64m8 = alloca <16 x i64>, align 8 // CHECK-128-NEXT: %local_f32m8 = alloca <32 x float>, align 8 // CHECK-128-NEXT: %local_f64m8 = alloca <16 x double>, align 8 +// CHECK-128-NEXT: %local_bool1 = alloca <16 x i8>, align 8 +// CHECK-128-NEXT: %local_bool2 = alloca <8 x i8>, align 8 +// CHECK-128-NEXT: %local_bool4 = alloca <4 x i8>, align 4 +// CHECK-128-NEXT: %local_bool8 = alloca <2 x i8>, align 2 +// CHECK-128-NEXT: %local_bool16 = alloca <1 x i8>, align 1 // CHECK-256: %local_i8 = alloca <32 x i8>, align 8 // CHECK-256-NEXT: %local_i16 = alloca <16 x i16>, align 8 @@ -1413,6 +1648,12 @@ void f() { // CHECK-256-NEXT: %local_u64m8 = alloca <32 x i64>, align 8 // CHECK-256-NEXT: %local_f32m8 = alloca <64 x float>, align 8 // CHECK-256-NEXT: %local_f64m8 = alloca <32 x double>, align 8 +// CHECK-256-NEXT: %local_bool1 = alloca <32 x i8>, align 8 +// CHECK-256-NEXT: %local_bool2 = alloca <16 x i8>, align 8 +// CHECK-256-NEXT: %local_bool4 = alloca <8 x i8>, align 8 +// CHECK-256-NEXT: %local_bool8 = alloca <4 x i8>, align 4 +// CHECK-256-NEXT: %local_bool16 = alloca <2 x i8>, align 2 +// CHECK-256-NEXT: %local_bool32 = alloca <1 x i8>, align 1 // CHECK-512: %local_i8 = alloca <64 x i8>, align 8 // CHECK-512-NEXT: %local_i16 = alloca <32 x i16>, align 8 @@ 
-1454,6 +1695,13 @@ void f() { // CHECK-512-NEXT: %local_u64m8 = alloca <64 x i64>, align 8 // CHECK-512-NEXT: %local_f32m8 = alloca <128 x float>, align 8 // CHECK-512-NEXT: %local_f64m8 = alloca <64 x double>, align 8 +// CHECK-512-NEXT: %local_bool1 = alloca <64 x i8>, align 8 +// CHECK-512-NEXT: %local_bool2 = alloca <32 x i8>, align 8 +// CHECK-512-NEXT: %local_bool4 = alloca <16 x i8>, align 8 +// CHECK-512-NEXT: %local_bool8 = alloca <8 x i8>, align 8 +// CHECK-512-NEXT: %local_bool16 = alloca <4 x i8>, align 4 +// CHECK-512-NEXT: %local_bool32 = alloca <2 x i8>, align 2 +// CHECK-512-NEXT: %local_bool64 = alloca <1 x i8>, align 1 // CHECK-1024: %local_i8 = alloca <128 x i8>, align 8 // CHECK-1024-NEXT: %local_i16 = alloca <64 x i16>, align 8 @@ -1495,6 +1743,13 @@ void f() { // CHECK-1024-NEXT: %local_u64m8 = alloca <128 x i64>, align 8 // CHECK-1024-NEXT: %local_f32m8 = alloca <256 x float>, align 8 // CHECK-1024-NEXT: %local_f64m8 = alloca <128 x double>, align 8 +// CHECK-1024-NEXT: %local_bool1 = alloca <128 x i8>, align 8 +// CHECK-1024-NEXT: %local_bool2 = alloca <64 x i8>, align 8 +// CHECK-1024-NEXT: %local_bool4 = alloca <32 x i8>, align 8 +// CHECK-1024-NEXT: %local_bool8 = alloca <16 x i8>, align 8 +// CHECK-1024-NEXT: %local_bool16 = alloca <8 x i8>, align 8 +// CHECK-1024-NEXT: %local_bool32 = alloca <4 x i8>, align 4 +// CHECK-1024-NEXT: %local_bool64 = alloca <2 x i8>, align 2 //===----------------------------------------------------------------------===// // Local arrays @@ -1552,6 +1807,10 @@ void f() { // CHECK-64-NEXT: %local_arr_u16mf4 = alloca [3 x <1 x i16>], align 2 // CHECK-64-NEXT: %local_arr_i8mf8 = alloca [3 x <1 x i8>], align 1 // CHECK-64-NEXT: %local_arr_u8mf8 = alloca [3 x <1 x i8>], align 1 +// CHECK-64-NEXT: %local_arr_bool1 = alloca [3 x <8 x i8>], align 8 +// CHECK-64-NEXT: %local_arr_bool2 = alloca [3 x <4 x i8>], align 4 +// CHECK-64-NEXT: %local_arr_bool4 = alloca [3 x <2 x i8>], align 2 +// CHECK-64-NEXT: 
%local_arr_bool8 = alloca [3 x <1 x i8>], align 1 // CHECK-128: %local_arr_i8 = alloca [3 x <16 x i8>], align 8 // CHECK-128-NEXT: %local_arr_i16 = alloca [3 x <8 x i16>], align 8 @@ -1606,6 +1865,11 @@ void f() { // CHECK-128-NEXT: %local_arr_u16mf4 = alloca [3 x <2 x i16>], align 4 // CHECK-128-NEXT: %local_arr_i8mf8 = alloca [3 x <2 x i8>], align 2 // CHECK-128-NEXT: %local_arr_u8mf8 = alloca [3 x <2 x i8>], align 2 +// CHECK-128-NEXT: %local_arr_bool1 = alloca [3 x <16 x i8>], align 8 +// CHECK-128-NEXT: %local_arr_bool2 = alloca [3 x <8 x i8>], align 8 +// CHECK-128-NEXT: %local_arr_bool4 = alloca [3 x <4 x i8>], align 4 +// CHECK-128-NEXT: %local_arr_bool8 = alloca [3 x <2 x i8>], align 2 +// CHECK-128-NEXT: %local_arr_bool16 = alloca [3 x <1 x i8>], align 1 // CHECK-256: %local_arr_i8 = alloca [3 x <32 x i8>], align 8 // CHECK-256-NEXT: %local_arr_i16 = alloca [3 x <16 x i16>], align 8 @@ -1660,6 +1924,12 @@ void f() { // CHECK-256-NEXT: %local_arr_u16mf4 = alloca [3 x <4 x i16>], align 8 // CHECK-256-NEXT: %local_arr_i8mf8 = alloca [3 x <4 x i8>], align 4 // CHECK-256-NEXT: %local_arr_u8mf8 = alloca [3 x <4 x i8>], align 4 +// CHECK-256-NEXT: %local_arr_bool1 = alloca [3 x <32 x i8>], align 8 +// CHECK-256-NEXT: %local_arr_bool2 = alloca [3 x <16 x i8>], align 8 +// CHECK-256-NEXT: %local_arr_bool4 = alloca [3 x <8 x i8>], align 8 +// CHECK-256-NEXT: %local_arr_bool8 = alloca [3 x <4 x i8>], align 4 +// CHECK-256-NEXT: %local_arr_bool16 = alloca [3 x <2 x i8>], align 2 +// CHECK-256-NEXT: %local_arr_bool32 = alloca [3 x <1 x i8>], align 1 // CHECK-512: %local_arr_i8 = alloca [3 x <64 x i8>], align 8 // CHECK-512-NEXT: %local_arr_i16 = alloca [3 x <32 x i16>], align 8 @@ -1714,6 +1984,13 @@ void f() { // CHECK-512-NEXT: %local_arr_u16mf4 = alloca [3 x <8 x i16>], align 8 // CHECK-512-NEXT: %local_arr_i8mf8 = alloca [3 x <8 x i8>], align 8 // CHECK-512-NEXT: %local_arr_u8mf8 = alloca [3 x <8 x i8>], align 8 +// CHECK-512-NEXT: %local_arr_bool1 = alloca [3 x 
<64 x i8>], align 8 +// CHECK-512-NEXT: %local_arr_bool2 = alloca [3 x <32 x i8>], align 8 +// CHECK-512-NEXT: %local_arr_bool4 = alloca [3 x <16 x i8>], align 8 +// CHECK-512-NEXT: %local_arr_bool8 = alloca [3 x <8 x i8>], align 8 +// CHECK-512-NEXT: %local_arr_bool16 = alloca [3 x <4 x i8>], align 4 +// CHECK-512-NEXT: %local_arr_bool32 = alloca [3 x <2 x i8>], align 2 +// CHECK-512-NEXT: %local_arr_bool64 = alloca [3 x <1 x i8>], align 1 // CHECK-1024: %local_arr_i8 = alloca [3 x <128 x i8>], align 8 // CHECK-1024-NEXT: %local_arr_i16 = alloca [3 x <64 x i16>], align 8 @@ -1768,3 +2045,10 @@ void f() { // CHECK-1024-NEXT: %local_arr_u16mf4 = alloca [3 x <16 x i16>], align 8 // CHECK-1024-NEXT: %local_arr_i8mf8 = alloca [3 x <16 x i8>], align 8 // CHECK-1024-NEXT: %local_arr_u8mf8 = alloca [3 x <16 x i8>], align 8 +// CHECK-1024-NEXT: %local_arr_bool1 = alloca [3 x <128 x i8>], align 8 +// CHECK-1024-NEXT: %local_arr_bool2 = alloca [3 x <64 x i8>], align 8 +// CHECK-1024-NEXT: %local_arr_bool4 = alloca [3 x <32 x i8>], align 8 +// CHECK-1024-NEXT: %local_arr_bool8 = alloca [3 x <16 x i8>], align 8 +// CHECK-1024-NEXT: %local_arr_bool16 = alloca [3 x <8 x i8>], align 8 +// CHECK-1024-NEXT: %local_arr_bool32 = alloca [3 x <4 x i8>], align 4 +// CHECK-1024-NEXT: %local_arr_bool64 = alloca [3 x <2 x i8>], align 2 diff --git a/clang/test/CodeGen/fat-lto-objects.c b/clang/test/CodeGen/fat-lto-objects.c index afce798c5c8194..b50567c024fc8c 100644 --- a/clang/test/CodeGen/fat-lto-objects.c +++ b/clang/test/CodeGen/fat-lto-objects.c @@ -11,10 +11,11 @@ // RUN: llvm-objcopy --dump-section=.llvm.lto=%t.full.split.bc %t.full.split.o // RUN: llvm-dis %t.full.split.bc -o - | FileCheck %s --check-prefixes=FULL,SPLIT,NOUNIFIED +/// Full LTO always sets EnableSplitLTOUnit when the summary is used. 
// RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=full -ffat-lto-objects -emit-obj < %s -o %t.full.nosplit.o // RUN: llvm-readelf -S %t.full.nosplit.o | FileCheck %s --check-prefixes=ELF // RUN: llvm-objcopy --dump-section=.llvm.lto=%t.full.nosplit.bc %t.full.nosplit.o -// RUN: llvm-dis %t.full.nosplit.bc -o - | FileCheck %s --check-prefixes=FULL,NOSPLIT,NOUNIFIED +// RUN: llvm-dis %t.full.nosplit.bc -o - | FileCheck %s --check-prefixes=FULL,SPLIT,NOUNIFIED // RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=thin -fsplit-lto-unit -ffat-lto-objects -emit-obj < %s -o %t.thin.split.o // RUN: llvm-readelf -S %t.thin.split.o | FileCheck %s --check-prefixes=ELF @@ -34,6 +35,21 @@ // RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=full -ffat-lto-objects -fsplit-lto-unit -S < %s -o - \ // RUN: | FileCheck %s --check-prefixes=ASM +/// Make sure that FatLTO generates .llvm.lto sections that are the same as the output from normal LTO compilations +// RUN: %clang -O2 --target=x86_64-unknown-linux-gnu -fPIE -flto=full -ffat-lto-objects -c %s -o %t.fatlto.full.o +// RUN: llvm-objcopy --dump-section=.llvm.lto=%t.fatlto.full.bc %t.fatlto.full.o +// RUN: llvm-dis < %t.fatlto.full.bc -o %t.fatlto.full.ll +// RUN: %clang -O2 --target=x86_64-unknown-linux-gnu -fPIE -flto=full -c %s -o %t.nofat.full.bc +// RUN: llvm-dis < %t.nofat.full.bc -o %t.nofat.full.ll +// RUN: diff %t.fatlto.full.ll %t.nofat.full.ll + +// RUN: %clang -O2 --target=x86_64-unknown-linux-gnu -fPIE -flto=thin -ffat-lto-objects -c %s -o %t.fatlto.thin.o +// RUN: llvm-objcopy --dump-section=.llvm.lto=%t.fatlto.thin.bc %t.fatlto.thin.o +// RUN: llvm-dis < %t.fatlto.thin.bc -o %t.fatlto.thin.ll +// RUN: %clang -O2 --target=x86_64-unknown-linux-gnu -fPIE -flto=thin -c %s -o %t.nofat.thin.bc +// RUN: llvm-dis < %t.nofat.thin.bc -o %t.nofat.thin.ll +// RUN: diff %t.fatlto.thin.ll %t.nofat.thin.ll + /// Be sure we enable split LTO units correctly under -ffat-lto-objects. 
// SPLIT: ![[#]] = !{i32 1, !"EnableSplitLTOUnit", i32 1} // NOSPLIT: ![[#]] = !{i32 1, !"EnableSplitLTOUnit", i32 0} @@ -51,6 +67,9 @@ // ASM-NEXT: .asciz "BC // ASM-NEXT: .size .Lllvm.embedded.object +const char* foo = "foo"; + int test(void) { + const char* bar = "bar"; return 0xabcd; } diff --git a/clang/test/CodeGenCXX/auto-var-init.cpp b/clang/test/CodeGenCXX/auto-var-init.cpp index e5a9d015f22f27..2ef3409f014e98 100644 --- a/clang/test/CodeGenCXX/auto-var-init.cpp +++ b/clang/test/CodeGenCXX/auto-var-init.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks %s -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK,CHECK-O0 // RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks -ftrivial-auto-var-init=pattern %s -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK-O0,PATTERN,PATTERN-O0 -// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks -ftrivial-auto-var-init=pattern %s -O1 -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK-O1,PATTERN,PATTERN-O1 +// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks -ftrivial-auto-var-init=pattern %s -O1 -emit-llvm -o - | FileCheck %s -check-prefixes=PATTERN,PATTERN-O1 // RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks -ftrivial-auto-var-init=zero %s -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK-O0,ZERO,ZERO-O0 -// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks -ftrivial-auto-var-init=zero %s -O1 -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK-O1,ZERO,ZERO-O1 +// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks -ftrivial-auto-var-init=zero %s -O1 -emit-llvm -o - | FileCheck %s -check-prefixes=ZERO,ZERO-O1 // RUN: %clang_cc1 -std=c++14 -triple i386-unknown-unknown -fblocks -ftrivial-auto-var-init=pattern %s -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK-O0,PATTERN,PATTERN-O0 #pragma clang diagnostic ignored "-Winaccessible-base" @@ -1303,9 +1303,10 @@ 
TEST_CUSTOM(semivolatile, semivolatile, { 0x44444444, 0x44444444 }); // CHECK-O0: call void @llvm.memcpy // CHECK-NOT: !annotation // CHECK-O0: call void @{{.*}}used{{.*}}%custom) -// CHECK-O1: store i32 1145324612, ptr %custom, align 4 -// CHECK-O1-NEXT: %[[I:[^ ]*]] = getelementptr inbounds i8, ptr %custom, i64 4 -// CHECK-O1-NEXT: store i32 1145324612, ptr %[[I]], align 4 +// PATTERN-O1: store i32 1145324612, ptr %custom, align 4 +// PATTERN-O1-NEXT: %[[I:[^ ]*]] = getelementptr inbounds i8, ptr %custom, i64 4 +// PATTERN-O1-NEXT: store i32 1145324612, ptr %[[I]], align 4 +// ZERO-O1: store i64 4919131752989213764, ptr %custom, align 8 // CHECK-NOT: !annotation TEST_UNINIT(semivolatileinit, semivolatileinit); @@ -1418,7 +1419,8 @@ TEST_CUSTOM(matching, matching, { .f = 0xf00f }); // CHECK-O0: call void @llvm.memcpy // CHECK-NOT: !annotation // CHECK-O0: call void @{{.*}}used{{.*}}%custom) -// CHECK-O1: store float 6.145500e+04, ptr {{.*}}, align 4 +// PATTERN-O1: store float 6.145500e+04, ptr {{.*}}, align 4 +// ZERO-O1: store i32 1198526208, ptr %custom, align 4 // CHECK-NOT: !annotation TEST_UNINIT(matchingreverse, matchingreverse); @@ -1445,7 +1447,8 @@ TEST_CUSTOM(matchingreverse, matchingreverse, { .i = 0xf00f }); // CHECK-O0: call void @llvm.memcpy // CHECK-NOT: !annotation // CHECK-O0: call void @{{.*}}used{{.*}}%custom) -// CHECK-O1: store i32 61455, ptr %custom, align 4 +// PATTERN-O1: store i32 61455, ptr %custom, align 4 +// ZERO-O1: store i32 61455, ptr %custom, align 4 // CHECK-NOT: !annotation TEST_UNINIT(unmatched, unmatched); @@ -1471,7 +1474,8 @@ TEST_CUSTOM(unmatched, unmatched, { .i = 0x3badbeef }); // CHECK-O0: call void @llvm.memcpy // CHECK-NOT: !annotation // CHECK-O0: call void @{{.*}}used{{.*}}%custom) -// CHECK-O1: store i32 1001242351, ptr {{.*}}, align 4 +// PATTERN-O1: store i32 1001242351, ptr {{.*}}, align 4 +// ZERO-O1: store i32 1001242351, ptr {{.*}}, align 4 // CHECK-NOT: !annotation TEST_UNINIT(unmatchedreverse, 
unmatchedreverse); @@ -1504,9 +1508,7 @@ TEST_CUSTOM(unmatchedreverse, unmatchedreverse, { .c = 42 }); // PATTERN-O1-NEXT: store i8 -86, ptr %[[I]], align {{.*}} // PATTERN-O1-NEXT: %[[I:[^ ]*]] = getelementptr inbounds i8, ptr %custom, i64 3 // PATTERN-O1-NEXT: store i8 -86, ptr %[[I]], align {{.*}} -// ZERO-O1: store i8 42, ptr {{.*}}, align 4 -// ZERO-O1-NEXT: %[[I:[^ ]*]] = getelementptr inbounds i8, ptr %custom, i64 1 -// ZERO-O1-NEXT: call void @llvm.memset.{{.*}}({{.*}}, i8 0, i64 3, {{.*}}) +// ZERO-O1: store i32 42, ptr {{.*}}, align 4 TEST_UNINIT(unmatchedfp, unmatchedfp); // CHECK-LABEL: @test_unmatchedfp_uninit() @@ -1531,7 +1533,8 @@ TEST_CUSTOM(unmatchedfp, unmatchedfp, { .d = 3.1415926535897932384626433 }); // CHECK-O0: call void @llvm.memcpy // CHECK-NOT: !annotation // CHECK-O0: call void @{{.*}}used{{.*}}%custom) -// CHECK-O1: store double 0x400921FB54442D18, ptr %custom, align 8 +// PATTERN-O1: store double 0x400921FB54442D18, ptr %custom, align 8 +// ZERO-O1: store i64 4614256656552045848, ptr %custom, align 8 // CHECK-NOT: !annotation TEST_UNINIT(emptyenum, emptyenum); diff --git a/clang/test/CodeGenCXX/cxx2b-static-call-operator.cpp b/clang/test/CodeGenCXX/cxx2b-static-call-operator.cpp index fd53649c9b0618..9cf5a7e00e7b4e 100644 --- a/clang/test/CodeGenCXX/cxx2b-static-call-operator.cpp +++ b/clang/test/CodeGenCXX/cxx2b-static-call-operator.cpp @@ -19,16 +19,22 @@ void CallsTheLambda() { // CHECK: define {{.*}}CallsTheLambda{{.*}} // CHECK-NEXT: entry: -// CHECK-NEXT: %call = call noundef i32 {{.*}}(i32 noundef 1, i32 noundef 2) +// CHECK: {{.*}}call {{.*}}GetALambda{{.*}}() +// CHECK-NEXT: {{.*}} = call noundef i32 {{.*}}(i32 noundef 1, i32 noundef 2) // CHECK-NEXT: ret void // CHECK-NEXT: } +Functor GetAFunctor() { + return {}; +} + void call_static_call_operator() { Functor f; f(101, 102); f.operator()(201, 202); Functor{}(301, 302); Functor::operator()(401, 402); + GetAFunctor()(501, 502); } // CHECK: define 
{{.*}}call_static_call_operator{{.*}} @@ -37,6 +43,8 @@ void call_static_call_operator() { // CHECK-NEXT: {{.*}} = call noundef i32 {{.*}}Functor{{.*}}(i32 noundef 201, i32 noundef 202) // CHECK-NEXT: {{.*}} = call noundef i32 {{.*}}Functor{{.*}}(i32 noundef 301, i32 noundef 302) // CHECK-NEXT: {{.*}} = call noundef i32 {{.*}}Functor{{.*}}(i32 noundef 401, i32 noundef 402) +// CHECK: {{.*}}call {{.*}}GetAFunctor{{.*}}() +// CHECK-NEXT: {{.*}} = call noundef i32 {{.*}}Functor{{.*}}(i32 noundef 501, i32 noundef 502) // CHECK-NEXT: ret void // CHECK-NEXT: } @@ -106,12 +114,16 @@ void test_dep_functors() { // CHECK: define {{.*}}test_dep_functors{{.*}} // CHECK-NEXT: entry: -// CHECK: %call = call noundef i32 {{.*}}DepFunctor{{.*}}(float noundef 1.000000e+00) -// CHECK: %call1 = call noundef i32 {{.*}}DepFunctor{{.*}}(i1 noundef zeroext true) -// CHECK: %call2 = call noundef i32 {{.*}}dep_lambda1{{.*}}(float noundef 1.000000e+00) -// CHECK: %call3 = call noundef i32 {{.*}}dep_lambda1{{.*}}(i1 noundef zeroext true) -// CHECK: %call4 = call noundef i32 {{.*}}dep_lambda2{{.*}}(float noundef 1.000000e+00) -// CHECK: %call5 = call noundef i32 {{.*}}dep_lambda2{{.*}}(i1 noundef zeroext true) +// CHECK: {{.*}} = call noundef i32 {{.*}}DepFunctor{{.*}}(float noundef 1.000000e+00) +// CHECK: {{.*}} = call noundef i32 {{.*}}DepFunctor{{.*}}(i1 noundef zeroext true) +// CHECK: {{.*}}call {{.*}}dep_lambda1{{.*}}() +// CHECK: {{.*}} = call noundef i32 {{.*}}dep_lambda1{{.*}}(float noundef 1.000000e+00) +// CHECK: {{.*}}call {{.*}}dep_lambda1{{.*}}() +// CHECK: {{.*}} = call noundef i32 {{.*}}dep_lambda1{{.*}}(i1 noundef zeroext true) +// CHECK: {{.*}}call {{.*}}dep_lambda2{{.*}}() +// CHECK: {{.*}} = call noundef i32 {{.*}}dep_lambda2{{.*}}(float noundef 1.000000e+00) +// CHECK: {{.*}}call {{.*}}dep_lambda2{{.*}}() +// CHECK: {{.*}} = call noundef i32 {{.*}}dep_lambda2{{.*}}(i1 noundef zeroext true) // CHECK: ret void // CHECK-NEXT: } diff --git 
a/clang/test/CodeGenCXX/cxx2b-static-subscript-operator.cpp b/clang/test/CodeGenCXX/cxx2b-static-subscript-operator.cpp index 5dbd2c50cc56bd..5d8258978c50d5 100644 --- a/clang/test/CodeGenCXX/cxx2b-static-subscript-operator.cpp +++ b/clang/test/CodeGenCXX/cxx2b-static-subscript-operator.cpp @@ -7,12 +7,17 @@ struct Functor { } }; +Functor GetAFunctor() { + return {}; +} + void call_static_subscript_operator() { Functor f; f[101, 102]; f.operator[](201, 202); Functor{}[301, 302]; Functor::operator[](401, 402); + GetAFunctor()[501, 502]; } // CHECK: define {{.*}}call_static_subscript_operator{{.*}} @@ -21,6 +26,8 @@ void call_static_subscript_operator() { // CHECK-NEXT: {{.*}} = call noundef i32 {{.*}}Functor{{.*}}(i32 noundef 201, i32 noundef 202) // CHECK-NEXT: {{.*}} = call noundef i32 {{.*}}Functor{{.*}}(i32 noundef 301, i32 noundef 302) // CHECK-NEXT: {{.*}} = call noundef i32 {{.*}}Functor{{.*}}(i32 noundef 401, i32 noundef 402) +// CHECK: {{.*}}call {{.*}}GetAFunctor{{.*}}() +// CHECK-NEXT: {{.*}} = call noundef i32 {{.*}}Functor{{.*}}(i32 noundef 501, i32 noundef 502) // CHECK-NEXT: ret void // CHECK-NEXT: } @@ -60,7 +67,7 @@ void test_dep_functors() { // CHECK: define {{.*}}test_dep_functors{{.*}} // CHECK-NEXT: entry: -// CHECK: %call = call noundef i32 {{.*}}DepFunctor{{.*}}(float noundef 1.000000e+00) -// CHECK: %call1 = call noundef i32 {{.*}}DepFunctor{{.*}}(i1 noundef zeroext true) +// CHECK: {{.*}} = call noundef i32 {{.*}}DepFunctor{{.*}}(float noundef 1.000000e+00) +// CHECK: {{.*}} = call noundef i32 {{.*}}DepFunctor{{.*}}(i1 noundef zeroext true) // CHECK: ret void // CHECK-NEXT: } diff --git a/clang/test/CodeGenCXX/riscv-mangle-rvv-fixed-vectors.cpp b/clang/test/CodeGenCXX/riscv-mangle-rvv-fixed-vectors.cpp index 32bd49f4ff725d..c9e7313a021a5e 100644 --- a/clang/test/CodeGenCXX/riscv-mangle-rvv-fixed-vectors.cpp +++ b/clang/test/CodeGenCXX/riscv-mangle-rvv-fixed-vectors.cpp @@ -85,6 +85,14 @@ typedef __rvv_float16m8_t vfloat16m8_t; typedef 
__rvv_float32m8_t vfloat32m8_t; typedef __rvv_float64m8_t vfloat64m8_t; +typedef __rvv_bool1_t vbool1_t; +typedef __rvv_bool2_t vbool2_t; +typedef __rvv_bool4_t vbool4_t; +typedef __rvv_bool8_t vbool8_t; +typedef __rvv_bool16_t vbool16_t; +typedef __rvv_bool32_t vbool32_t; +typedef __rvv_bool64_t vbool64_t; + typedef vint8mf8_t fixed_int8mf8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/8))); typedef vuint8mf8_t fixed_uint8mf8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/8))); @@ -164,6 +172,20 @@ typedef vfloat16m8_t fixed_float16m8_t __attribute__((riscv_rvv_vector_bits(__ri typedef vfloat32m8_t fixed_float32m8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen*8))); typedef vfloat64m8_t fixed_float64m8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen*8))); +typedef vbool1_t fixed_bool1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vbool2_t fixed_bool2_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/2))); +typedef vbool4_t fixed_bool4_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/4))); +typedef vbool8_t fixed_bool8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/8))); +#if __riscv_v_fixed_vlen >= 128 +typedef vbool16_t fixed_bool16_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/16))); +#endif +#if __riscv_v_fixed_vlen >= 256 +typedef vbool32_t fixed_bool32_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/32))); +#endif +#if __riscv_v_fixed_vlen >= 512 +typedef vbool64_t fixed_bool64_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/64))); +#endif + template struct S {}; // CHECK-64: _Z2f11SI9__RVV_VLSIu14__rvv_int8m1_tLj64EEE @@ -578,3 +600,53 @@ void mf8f1(S) {} // CHECK-512: _Z5mf8f51SI9__RVV_VLSIu16__rvv_uint8mf8_tLj64EEE // CHECK-1024: _Z5mf8f51SI9__RVV_VLSIu16__rvv_uint8mf8_tLj128EEE void mf8f5(S) {} + +// CHECK-64: _Z5bool11SI9__RVV_VLSIu13__rvv_bool1_tLj64EEE +// CHECK-128: 
_Z5bool11SI9__RVV_VLSIu13__rvv_bool1_tLj128EEE +// CHECK-256: _Z5bool11SI9__RVV_VLSIu13__rvv_bool1_tLj256EEE +// CHECK-512: _Z5bool11SI9__RVV_VLSIu13__rvv_bool1_tLj512EEE +// CHECK-1024: _Z5bool11SI9__RVV_VLSIu13__rvv_bool1_tLj1024EEE +void bool1(S) {} + +// CHECK-64: _Z5bool21SI9__RVV_VLSIu13__rvv_bool2_tLj32EEE +// CHECK-128: _Z5bool21SI9__RVV_VLSIu13__rvv_bool2_tLj64EEE +// CHECK-256: _Z5bool21SI9__RVV_VLSIu13__rvv_bool2_tLj128EEE +// CHECK-512: _Z5bool21SI9__RVV_VLSIu13__rvv_bool2_tLj256EEE +// CHECK-1024: _Z5bool21SI9__RVV_VLSIu13__rvv_bool2_tLj512EEE +void bool2(S) {} + +// CHECK-64: _Z5bool41SI9__RVV_VLSIu13__rvv_bool4_tLj16EEE +// CHECK-128: _Z5bool41SI9__RVV_VLSIu13__rvv_bool4_tLj32EEE +// CHECK-256: _Z5bool41SI9__RVV_VLSIu13__rvv_bool4_tLj64EEE +// CHECK-512: _Z5bool41SI9__RVV_VLSIu13__rvv_bool4_tLj128EEE +// CHECK-1024: _Z5bool41SI9__RVV_VLSIu13__rvv_bool4_tLj256EEE +void bool4(S) {} + +// CHECK-64: _Z5bool81SI9__RVV_VLSIu13__rvv_bool8_tLj8EEE +// CHECK-128: _Z5bool81SI9__RVV_VLSIu13__rvv_bool8_tLj16EEE +// CHECK-256: _Z5bool81SI9__RVV_VLSIu13__rvv_bool8_tLj32EEE +// CHECK-512: _Z5bool81SI9__RVV_VLSIu13__rvv_bool8_tLj64EEE +// CHECK-1024: _Z5bool81SI9__RVV_VLSIu13__rvv_bool8_tLj128EEE +void bool8(S) {} + +#if __riscv_v_fixed_vlen >= 128 +// CHECK-128: _Z6bool161SI9__RVV_VLSIu14__rvv_bool16_tLj8EEE +// CHECK-256: _Z6bool161SI9__RVV_VLSIu14__rvv_bool16_tLj16EEE +// CHECK-512: _Z6bool161SI9__RVV_VLSIu14__rvv_bool16_tLj32EEE +// CHECK-1024: _Z6bool161SI9__RVV_VLSIu14__rvv_bool16_tLj64EEE +// +void bool16(S) {} +#endif + +#if __riscv_v_fixed_vlen >= 256 +// CHECK-256: _Z6bool321SI9__RVV_VLSIu14__rvv_bool32_tLj8EEE +// CHECK-512: _Z6bool321SI9__RVV_VLSIu14__rvv_bool32_tLj16EEE +// CHECK-1024: _Z6bool321SI9__RVV_VLSIu14__rvv_bool32_tLj32EEE +void bool32(S) {} +#endif + +#if __riscv_v_fixed_vlen >= 512 +// CHECK-512: _Z6bool641SI9__RVV_VLSIu14__rvv_bool64_tLj8EEE +// CHECK-1024: _Z6bool641SI9__RVV_VLSIu14__rvv_bool64_tLj16EEE +void bool64(S) {} +#endif diff 
--git a/clang/test/CodeGenObjCXX/msabi-stret-arm64.mm b/clang/test/CodeGenObjCXX/msabi-stret-arm64.mm new file mode 100644 index 00000000000000..3bbdbebc5cb576 --- /dev/null +++ b/clang/test/CodeGenObjCXX/msabi-stret-arm64.mm @@ -0,0 +1,77 @@ +// RUN: %clang_cc1 -triple aarch64-pc-windows-msvc -fobjc-runtime=gnustep-2.2 -fobjc-dispatch-method=non-legacy -emit-llvm -o - %s | FileCheck %s + +// Pass and return for type size <= 8 bytes. +struct S1 { + int a[2]; +}; + +// Pass and return hfa <= 8 bytes +struct F1 { + float a[2]; +}; + +// Pass and return for type size > 16 bytes. +struct S2 { + int a[5]; +}; + +// Pass and return aggregate (of size < 16 bytes) with non-trivial destructor. +// Sret and inreg: Returned in x0 +struct S3 { + int a[3]; + ~S3(); +}; +S3::~S3() { +} + + +@interface MsgTest { id isa; } @end +@implementation MsgTest +- (S1) smallS1 { + S1 x; + x.a[0] = 0; + x.a[1] = 1; + return x; + +} +- (F1) smallF1 { + F1 x; + x.a[0] = 0.2f; + x.a[1] = 0.5f; + return x; +} +- (S2) stretS2 { + S2 x; + for (int i = 0; i < 5; i++) { + x.a[i] = i; + } + return x; +} +- (S3) stretInRegS3 { + S3 x; + for (int i = 0; i < 3; i++) { + x.a[i] = i; + } + return x; +} ++ (S3) msgTestStretInRegS3 { + S3 x; + for (int i = 0; i < 3; i++) { + x.a[i] = i; + } + return x; +} +@end + +void test0(MsgTest *t) { + // CHECK: call {{.*}} @objc_msgSend + S1 ret = [t smallS1]; + // CHECK: call {{.*}} @objc_msgSend + F1 ret2 = [t smallF1]; + // CHECK: call {{.*}} @objc_msgSend_stret + S2 ret3 = [t stretS2]; + // CHECK: call {{.*}} @objc_msgSend_stret2 + S3 ret4 = [t stretInRegS3]; + // CHECK: call {{.*}} @objc_msgSend_stret2 + S3 ret5 = [MsgTest msgTestStretInRegS3]; +} diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 1ba2b129f6895a..9c8ca0bb96f612 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -100,8 +100,8 @@ // GFX1103: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1201: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64" diff --git a/clang/test/CodeGenOpenCL/amdgpu-printf.cl b/clang/test/CodeGenOpenCL/amdgpu-printf.cl index 6c84485b66b4a0..edf6dbf8657cbe 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-printf.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-printf.cl @@ -30,14 +30,7 @@ __kernel void test_printf_int(int i) { // CHECK-NEXT: [[S:%.*]] = alloca [4 x i8], align 1, addrspace(5) // CHECK-NEXT: store i32 [[I:%.*]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA8]] // CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR5:[0-9]+]] -// CHECK-NEXT: [[LOC0:%.*]] = getelementptr i8, ptr addrspace(5) [[S]], i64 0 -// CHECK-NEXT: store i8 102, ptr addrspace(5) [[LOC0]], align 1 -// CHECK-NEXT: [[LOC1:%.*]] = getelementptr i8, ptr addrspace(5) [[S]], i64 1 -// CHECK-NEXT: store i8 111, ptr addrspace(5) [[LOC1]], align 1 -// CHECK-NEXT: [[LOC2:%.*]] = getelementptr i8, ptr addrspace(5) [[S]], i64 2 -// CHECK-NEXT: store i8 111, ptr addrspace(5) [[LOC2]], align 1 -// CHECK-NEXT: [[LOC3:%.*]] = getelementptr i8, ptr addrspace(5) [[S]], i64 3 -// CHECK-NEXT: store i8 0, ptr addrspace(5) [[LOC3]], align 1 +// CHECK-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 1 [[S]], ptr addrspace(4) align 1 @__const.test_printf_str_int.s, i64 4, i1 false) // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i8], ptr addrspace(5) [[S]], i64 0, i64 
0 // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA8]] // CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str.2, ptr addrspace(5) noundef [[ARRAYDECAY]], i32 noundef [[TMP2]]) #[[ATTR4]] diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl index 56d757012a5e78..4e3a56b4201bb6 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl @@ -1,59 +1,60 @@ // REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940 +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s typedef float v2f __attribute__((ext_vector_type(2))); -// CHECK-GFX940-LABEL: @test_cvt_f32_bf8 -// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0) +// CHECK-LABEL: @test_cvt_f32_bf8 +// CHECK: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0) void test_cvt_f32_bf8(global int* out, int a) { *out = __builtin_amdgcn_cvt_f32_bf8(a, 0); } -// CHECK-GFX940-LABEL: @test_cvt_f32_fp8 -// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1) +// CHECK-LABEL: @test_cvt_f32_fp8 +// CHECK: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1) void test_cvt_f32_fp8(global int* out, int a) { *out = __builtin_amdgcn_cvt_f32_fp8(a, 1); } -// CHECK-GFX940-LABEL: @test_cvt_pk_f32_bf8 -// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false) +// CHECK-LABEL: @test_cvt_pk_f32_bf8 +// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false) void test_cvt_pk_f32_bf8(global v2f* out, int a) { *out = __builtin_amdgcn_cvt_pk_f32_bf8(a, false); } -// CHECK-GFX940-LABEL: @test_cvt_pk_f32_fp8 -// 
CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true) +// CHECK-LABEL: @test_cvt_pk_f32_fp8 +// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true) void test_cvt_pk_f32_fp8(global v2f* out, int a) { *out = __builtin_amdgcn_cvt_pk_f32_fp8(a, true); } -// CHECK-GFX940-LABEL: @test_cvt_pk_bf8_f32 -// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false) +// CHECK-LABEL: @test_cvt_pk_bf8_f32 +// CHECK: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false) void test_cvt_pk_bf8_f32(global int* out, int old, float a, float b) { *out = __builtin_amdgcn_cvt_pk_bf8_f32(a, b, old, false); } -// CHECK-GFX940-LABEL: @test_cvt_pk_fp8_f32 -// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true) +// CHECK-LABEL: @test_cvt_pk_fp8_f32 +// CHECK: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true) void test_cvt_pk_fp8_f32(global int* out, int old, float a, float b) { *out = __builtin_amdgcn_cvt_pk_fp8_f32(a, b, old, true); } -// CHECK-GFX940-LABEL: @test_cvt_sr_bf8_f32 -// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2) +// CHECK-LABEL: @test_cvt_sr_bf8_f32 +// CHECK: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2) void test_cvt_sr_bf8_f32(global int* out, int old, float a, int b) { *out = __builtin_amdgcn_cvt_sr_bf8_f32(a, b, old, 2); } -// CHECK-GFX940-LABEL: @test_cvt_sr_fp8_f32 -// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3) +// CHECK-LABEL: @test_cvt_sr_fp8_f32 +// CHECK: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3) void test_cvt_sr_fp8_f32(global int* out, int old, float a, int b) { *out = __builtin_amdgcn_cvt_sr_fp8_f32(a, b, old, 3); diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl new file mode 100644 index 
00000000000000..11747af7ea74f5 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl @@ -0,0 +1,156 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef int v8i __attribute__((ext_vector_type(8))); + +// Wave32 + +// +// amdgcn_wmma_f32_16x16x16_f16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c); +} + +// +// amdgcn_wmma_f32_16x16x16_bf16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a, b, c); +} + +// +// amdgcn_wmma_f16_16x16x16_f16 +// + +// 
CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a, b, c); +} + +// +// amdgcn_wmma_bf16_16x16x16_bf16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12(a, b, c); +} + +// +// amdgcn_wmma_i32_16x16x16_iu8 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a, true, b, c, false); +} + +// +// amdgcn_wmma_i32_16x16x16_iu4 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w32( +// CHECK-GFX1200-NEXT: 
entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x 
float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x32_iu4_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl new file mode 100644 index 00000000000000..ef32648743ca62 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl @@ -0,0 +1,155 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm 
-o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef float v4f __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef int v4i __attribute__((ext_vector_type(4))); + +// Wave64 + +// +// amdgcn_wmma_f32_16x16x16_f16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12(a, b, c); +} + +// +// amdgcn_wmma_f32_16x16x16_bf16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12(a, b, c); +} + +// +// amdgcn_wmma_f16_16x16x16_f16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// 
+void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12(a, b, c); +} + +// +// amdgcn_wmma_bf16_16x16x16_bf16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12(a, b, c); +} + +// +// amdgcn_wmma_i32_16x16x16_iu8 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12(true, a, true, b, c, false); +} + +// +// amdgcn_wmma_i32_16x16x16_iu4 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c) +{ + *out = 
__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(true, a, true, b, c, false); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x32_iu4_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(true, a, true, b, c, false); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl new file mode 100644 index 00000000000000..b303c2f25dddac --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl @@ -0,0 +1,135 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef int v8i __attribute__((ext_vector_type(8))); +typedef half v16h __attribute__((ext_vector_type(16))); +typedef short v16s 
__attribute__((ext_vector_type(16))); + +// Wave32 + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_f16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8s a, v16s b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f16_16x16x32_f16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h c, short index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: 
@test_amdgcn_swmmac_bf16_16x16x32_bf16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8s* out, v8s a, v16s b, v8s c, short index) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, v2i a, v4i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu4_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x64_iu4_w32( +// 
CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: 
[[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(a, b, c, index); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl new file mode 100644 index 00000000000000..855fa7351e1556 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl @@ -0,0 +1,134 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef float v4f __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef 
half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); + +// Wave64 + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_f16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> [[A:%.*]], <8 x half> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_f16_w64(global v4f* out, v4h a, v8h b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i16(<4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf16_w64(global v4f* out, v4s a, v8s b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f16_16x16x32_f16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> [[A:%.*]], <8 x half> [[B:%.*]], <4 x half> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f16_16x16x32_f16_w64(global v4h* out, v4h a, v8h b, v4h c, short index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64(a, b, c, index); +} + 
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_bf16_16x16x32_bf16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i16(<4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(global v4s* out, v4s a, v8s b, v4s c, short index) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x32_iu8_w64(global v4i* out, int a, v2i b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu4_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x64_iu4_w64( +// 
CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x64_iu4_w64(global v4i* out, int a, v2i b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 
x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(a, b, c, index); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl index 41a78ae268be57..49cb797df42331 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl @@ -16,7 +16,7 @@ typedef short v16s __attribute__((ext_vector_type(16))); // Wave32 -void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b16s, v8f c8f, +void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b16s, v8f c8f, global v16h* out16h, v16h a16h, v16h b16h, v16h c16h, global v16s* out16s, v2i a2i, v2i b2i, v16s c16s, global v8i* out8i, v4i a4i, v4i b4i, v8i c8i) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl index 
4f13c75e5e81f9..3c6aaf5e38281d 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl @@ -2,7 +2,6 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100 -typedef float v4f __attribute__((ext_vector_type(4))); typedef float v8f __attribute__((ext_vector_type(8))); typedef half v16h __attribute__((ext_vector_type(16))); typedef int v2i __attribute__((ext_vector_type(2))); @@ -20,7 +19,7 @@ typedef short v16s __attribute__((ext_vector_type(16))); // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]]) // CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]] // CHECK-GFX1100-NEXT: ret void // @@ -35,7 +34,7 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v16h a, v16h b, v8f // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]]) // CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -50,7 +49,7 @@ void 
test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v16s a, v16s b, v8f // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -65,7 +64,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v16h* out, v16h a, v16h b, v16 // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -80,7 +79,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v16s* out, v16s a, v16s b, v // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <16 x half> 
[[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -95,7 +94,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_tied_w32(global v16h* out, v16h a, v16h b // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -110,7 +109,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(global v16s* out, v16s a, v16s // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) // CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -125,7 +124,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v4i a, v4i b, v8i c) // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> 
@llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) // CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl index 4797675f50d42e..1490f14fd17b69 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl @@ -3,12 +3,10 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100 typedef float v4f __attribute__((ext_vector_type(4))); -typedef float v8f __attribute__((ext_vector_type(8))); typedef half v8h __attribute__((ext_vector_type(8))); typedef half v16h __attribute__((ext_vector_type(16))); typedef int v2i __attribute__((ext_vector_type(2))); typedef int v4i __attribute__((ext_vector_type(4))); -typedef int v8i __attribute__((ext_vector_type(8))); typedef short v8s __attribute__((ext_vector_type(8))); typedef short v16s __attribute__((ext_vector_type(16))); @@ -22,7 +20,7 @@ typedef short v16s __attribute__((ext_vector_type(16))); // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <4 x float> [[C:%.*]]) // CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]] // CHECK-GFX1100-NEXT: ret void // @@ -37,7 +35,7 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v16h a, v16h b, v4f // 
CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <4 x float> [[C:%.*]]) // CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -52,7 +50,7 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v16s a, v16s b, v4f // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -67,7 +65,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v8h* out, v16h a, v16h b, v8h // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -82,7 +80,7 @@ void 
test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v8s* out, v16s a, v16s b, v8 // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -97,7 +95,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(global v8h* out, v16h a, v16h b, // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -112,7 +110,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(global v8s* out, v16s a, v16s // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) // 
CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -127,7 +125,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, v4i a, v4i b, v4i c) // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) // CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // diff --git a/clang/test/CoverageMapping/if.cpp b/clang/test/CoverageMapping/if.cpp index 3045ffe43948cb..445cdfc20e2aff 100644 --- a/clang/test/CoverageMapping/if.cpp +++ b/clang/test/CoverageMapping/if.cpp @@ -234,6 +234,35 @@ constexpr int check_macro_consteval_if_skipped(int i) { // CHECK-NEXT: [[@LINE return i; } +struct false_value { + constexpr operator bool() { + return false; + } +}; + +template struct dependable_false_value { + constexpr operator bool() { + return false; + } +}; + +// GH-80285 +void should_not_crash() { + if constexpr (false_value{}) { }; +} + +template void should_not_crash_dependable() { + if constexpr (dependable_false_value{}) { }; +} + +void should_not_crash_with_template_instance() { + should_not_crash_dependable(); +} + +void should_not_crash_with_requires_expr() { + if constexpr (requires {42;}) { }; +} + int instantiate_consteval(int i) { i *= check_consteval_with_else_discarded_then(i); i *= check_notconsteval_with_else_discarded_else(i); diff --git a/clang/test/CoverageMapping/statement-expression.c b/clang/test/CoverageMapping/statement-expression.c new file mode 100644 index 
00000000000000..5f9ab5838af342 --- /dev/null +++ b/clang/test/CoverageMapping/statement-expression.c @@ -0,0 +1,36 @@ +// RUN: %clang_cc1 -mllvm -emptyline-comment-coverage=false -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name statement-expression.c %s + +// No crash for the following examples, where GNU Statement Expression extension +// could introduce region terminators (break, goto etc) before implicit +// initializers in a struct or an array. +// See https://github.com/llvm/llvm-project/pull/89564 + +struct Foo { + int field1; + int field2; +}; + +void f1(void) { + struct Foo foo = { + .field1 = ({ + switch (0) { + case 0: + break; // A region terminator + } + 0; + }), + // ImplicitValueInitExpr introduced here for .field2 + }; +} + +void f2(void) { + int arr[3] = { + [0] = ({ + goto L0; // A region terminator +L0: + 0; + }), + // ImplicitValueInitExpr introduced here for subscript [1] + [2] = 0, + }; +} diff --git a/clang/test/CoverageMapping/templates.cpp b/clang/test/CoverageMapping/templates.cpp index 7010edbc32c34a..143e566a33cb85 100644 --- a/clang/test/CoverageMapping/templates.cpp +++ b/clang/test/CoverageMapping/templates.cpp @@ -19,3 +19,16 @@ int main() { func(true); return 0; } + +namespace structural_value_crash { + template + void tpl_fn() { + (void)p; + } + + int arr[] = {1, 2, 3}; + + void test() { + tpl_fn(); + } +} diff --git a/clang/test/Driver/aarch64-cssc.c b/clang/test/Driver/aarch64-cssc.c index a3e18663279bbd..5df0ea79d7c850 100644 --- a/clang/test/Driver/aarch64-cssc.c +++ b/clang/test/Driver/aarch64-cssc.c @@ -9,6 +9,7 @@ // RUN: %clang -S -o - -emit-llvm --target=aarch64-none-elf -march=armv9.4-a %s 2>&1 | FileCheck %s // RUN: %clang -S -o - -emit-llvm --target=aarch64-none-elf -march=armv9.4-a+cssc %s 2>&1 | FileCheck %s // RUN: %clang -S -o - -emit-llvm --target=aarch64-none-elf -march=armv9.4-a+nocssc %s 2>&1 | FileCheck %s --check-prefix=NO_CSSC +// RUN: %clang -S -o - 
-emit-llvm --target=aarch64-none-elf -mcpu=ampere1b %s 2>&1 | FileCheck %s // CHECK: "target-features"="{{.*}},+cssc // NO_CSSC: "target-features"="{{.*}},-cssc diff --git a/clang/test/Driver/aarch64-mcpu.c b/clang/test/Driver/aarch64-mcpu.c index 511482a420da26..3e07f3597f3408 100644 --- a/clang/test/Driver/aarch64-mcpu.c +++ b/clang/test/Driver/aarch64-mcpu.c @@ -72,6 +72,9 @@ // RUN: %clang --target=aarch64 -mcpu=cortex-r82 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEXR82 %s // CORTEXR82: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "cortex-r82" +// RUN: %clang --target=aarch64 -mcpu=cobalt-100 -### -c %s 2>&1 | FileCheck -check-prefix=COBALT-100 %s +// COBALT-100: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-n2" + // RUN: %clang --target=aarch64 -mcpu=grace -### -c %s 2>&1 | FileCheck -check-prefix=GRACE %s // GRACE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-v2" diff --git a/clang/test/Driver/fat-lto-objects.c b/clang/test/Driver/fat-lto-objects.c index 97002db6edc51e..d9a5ba88ea6d6f 100644 --- a/clang/test/Driver/fat-lto-objects.c +++ b/clang/test/Driver/fat-lto-objects.c @@ -23,11 +23,17 @@ // CHECK-CC-S-EL-LTO-SAME: -emit-llvm // CHECK-CC-S-EL-LTO-SAME: -ffat-lto-objects -/// When fat LTO is enabled wihtout -S we expect native object output and -ffat-lto-object to be passed to cc1. +/// When fat LTO is enabled without -S we expect native object output and -ffat-lto-object to be passed to cc1. // RUN: %clang --target=x86_64-unknown-linux-gnu -flto -ffat-lto-objects -### %s -c 2>&1 | FileCheck %s -check-prefix=CHECK-CC-C-LTO // CHECK-CC-C-LTO: -cc1 -// CHECK-CC-C-LTO: -emit-obj -// CHECK-CC-C-LTO: -ffat-lto-objects +// CHECK-CC-C-LTO-SAME: -emit-obj +// CHECK-CC-C-LTO-SAME: -ffat-lto-objects + +/// When fat LTO is enabled with -c and -emit-llvm we expect bitcode output and -ffat-lto-object to be passed to cc1. 
+// RUN: %clang --target=x86_64-unknown-linux-gnu -flto -ffat-lto-objects -### %s -c -emit-llvm 2>&1 | FileCheck %s -check-prefix=CHECK-CC-C-EL-LTO +// CHECK-CC-C-EL-LTO: -cc1 +// CHECK-CC-C-EL-LTO-SAME: -emit-llvm-bc +// CHECK-CC-C-EL-LTO-SAME: -ffat-lto-objects /// Make sure we don't have a warning for -ffat-lto-objects being unused // RUN: %clang --target=x86_64-unknown-linux-gnu -ffat-lto-objects -fdriver-only -Werror -v %s -c 2>&1 | FileCheck %s -check-prefix=CHECK-CC-NOLTO diff --git a/clang/test/Driver/fveclib.c b/clang/test/Driver/fveclib.c index e2a7619e9b89f7..8a230284bcdfe4 100644 --- a/clang/test/Driver/fveclib.c +++ b/clang/test/Driver/fveclib.c @@ -31,3 +31,21 @@ // RUN: %clang -fveclib=Accelerate %s -nodefaultlibs -target arm64-apple-ios8.0.0 -### 2>&1 | FileCheck --check-prefix=CHECK-LINK-NODEFAULTLIBS %s // CHECK-LINK-NODEFAULTLIBS-NOT: "-framework" "Accelerate" + + +/* Verify that the correct vector library is passed to LTO flags. */ + +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=LIBMVEC -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-LIBMVEC %s +// CHECK-LTO-LIBMVEC: "-plugin-opt=-vector-library=LIBMVEC-X86" + +// RUN: %clang -### --target=powerpc64-unknown-linux-gnu -fveclib=MASSV -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-MASSV %s +// CHECK-LTO-MASSV: "-plugin-opt=-vector-library=MASSV" + +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=SVML -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-SVML %s +// CHECK-LTO-SVML: "-plugin-opt=-vector-library=SVML" + +// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=SLEEF -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-SLEEF %s +// CHECK-LTO-SLEEF: "-plugin-opt=-vector-library=sleefgnuabi" + +// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=ArmPL -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-ARMPL %s +// CHECK-LTO-ARMPL: "-plugin-opt=-vector-library=ArmPL" diff --git a/clang/test/Driver/modules-skip-odr-check-in-gmf.cpp 
b/clang/test/Driver/modules-skip-odr-check-in-gmf.cpp new file mode 100644 index 00000000000000..b00b6d330ba459 --- /dev/null +++ b/clang/test/Driver/modules-skip-odr-check-in-gmf.cpp @@ -0,0 +1,10 @@ +// RUN: %clang -std=c++20 -### -c %s 2>&1 | FileCheck %s +// RUN: %clang -std=c++20 -fno-skip-odr-check-in-gmf -### -c %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=UNUSED +// RUN: %clang -std=c++20 -Xclang -fno-skip-odr-check-in-gmf -### -c %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=NO-SKIP + +// CHECK: -fskip-odr-check-in-gmf +// UNUSED: warning: argument unused during compilation: '-fno-skip-odr-check-in-gmf' +// UNUSED-NOT: -fno-skip-odr-check-in-gmf +// NO-SKIP: -fskip-odr-check-in-gmf{{.*}}-fno-skip-odr-check-in-gmf diff --git a/clang/test/Driver/sparc-fixed-register.c b/clang/test/Driver/sparc-fixed-register.c new file mode 100644 index 00000000000000..24880b9c9d86fd --- /dev/null +++ b/clang/test/Driver/sparc-fixed-register.c @@ -0,0 +1,181 @@ +// RUN: %clang --target=sparc-none-gnu -ffixed-g1 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G1 < %t %s +// CHECK-FIXED-G1: "-target-feature" "+reserve-g1" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g2 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G2 < %t %s +// CHECK-FIXED-G2: "-target-feature" "+reserve-g2" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g3 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G3 < %t %s +// CHECK-FIXED-G3: "-target-feature" "+reserve-g3" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g4 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G4 < %t %s +// CHECK-FIXED-G4: "-target-feature" "+reserve-g4" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g5 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G5 < %t %s +// CHECK-FIXED-G5: "-target-feature" "+reserve-g5" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g6 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G6 < %t %s +// CHECK-FIXED-G6: 
"-target-feature" "+reserve-g6" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g7 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G7 < %t %s +// CHECK-FIXED-G7: "-target-feature" "+reserve-g7" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o0 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O0 < %t %s +// CHECK-FIXED-O0: "-target-feature" "+reserve-o0" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o1 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O1 < %t %s +// CHECK-FIXED-O1: "-target-feature" "+reserve-o1" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o2 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O2 < %t %s +// CHECK-FIXED-O2: "-target-feature" "+reserve-o2" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o3 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O3 < %t %s +// CHECK-FIXED-O3: "-target-feature" "+reserve-o3" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o4 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O4 < %t %s +// CHECK-FIXED-O4: "-target-feature" "+reserve-o4" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o5 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O5 < %t %s +// CHECK-FIXED-O5: "-target-feature" "+reserve-o5" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l0 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L0 < %t %s +// CHECK-FIXED-L0: "-target-feature" "+reserve-l0" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l1 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L1 < %t %s +// CHECK-FIXED-L1: "-target-feature" "+reserve-l1" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l2 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L2 < %t %s +// CHECK-FIXED-L2: "-target-feature" "+reserve-l2" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l3 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L3 < %t %s +// CHECK-FIXED-L3: "-target-feature" "+reserve-l3" + +// RUN: %clang 
--target=sparc-none-gnu -ffixed-l4 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L4 < %t %s +// CHECK-FIXED-L4: "-target-feature" "+reserve-l4" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l5 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L5 < %t %s +// CHECK-FIXED-L5: "-target-feature" "+reserve-l5" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l6 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L6 < %t %s +// CHECK-FIXED-L6: "-target-feature" "+reserve-l6" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l7 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L7 < %t %s +// CHECK-FIXED-L7: "-target-feature" "+reserve-l7" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i0 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I0 < %t %s +// CHECK-FIXED-I0: "-target-feature" "+reserve-i0" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i1 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I1 < %t %s +// CHECK-FIXED-I1: "-target-feature" "+reserve-i1" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i2 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I2 < %t %s +// CHECK-FIXED-I2: "-target-feature" "+reserve-i2" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i3 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I3 < %t %s +// CHECK-FIXED-I3: "-target-feature" "+reserve-i3" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i4 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I4 < %t %s +// CHECK-FIXED-I4: "-target-feature" "+reserve-i4" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i5 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I5 < %t %s +// CHECK-FIXED-I5: "-target-feature" "+reserve-i5" + +// Test multiple of reserve-* options together. 
+// RUN: %clang --target=sparc-none-gnu \ +// RUN: -ffixed-g1 \ +// RUN: -ffixed-o2 \ +// RUN: -ffixed-l3 \ +// RUN: -ffixed-i4 \ +// RUN: -### %s 2> %t +// RUN: FileCheck \ +// RUN: --check-prefix=CHECK-FIXED-G1 \ +// RUN: --check-prefix=CHECK-FIXED-O2 \ +// RUN: --check-prefix=CHECK-FIXED-L3 \ +// RUN: --check-prefix=CHECK-FIXED-I4 \ +// RUN: < %t %s + +// Test all reserve-* options together. +// RUN: %clang --target=sparc-none-gnu \ +// RUN: -ffixed-g1 \ +// RUN: -ffixed-g2 \ +// RUN: -ffixed-g3 \ +// RUN: -ffixed-g4 \ +// RUN: -ffixed-g5 \ +// RUN: -ffixed-g6 \ +// RUN: -ffixed-g7 \ +// RUN: -ffixed-o0 \ +// RUN: -ffixed-o1 \ +// RUN: -ffixed-o2 \ +// RUN: -ffixed-o3 \ +// RUN: -ffixed-o4 \ +// RUN: -ffixed-o5 \ +// RUN: -ffixed-l0 \ +// RUN: -ffixed-l1 \ +// RUN: -ffixed-l2 \ +// RUN: -ffixed-l3 \ +// RUN: -ffixed-l4 \ +// RUN: -ffixed-l5 \ +// RUN: -ffixed-l6 \ +// RUN: -ffixed-l7 \ +// RUN: -ffixed-i0 \ +// RUN: -ffixed-i1 \ +// RUN: -ffixed-i2 \ +// RUN: -ffixed-i3 \ +// RUN: -ffixed-i4 \ +// RUN: -ffixed-i5 \ +// RUN: -### %s 2> %t +// RUN: FileCheck \ +// RUN: --check-prefix=CHECK-FIXED-G1 \ +// RUN: --check-prefix=CHECK-FIXED-G2 \ +// RUN: --check-prefix=CHECK-FIXED-G3 \ +// RUN: --check-prefix=CHECK-FIXED-G4 \ +// RUN: --check-prefix=CHECK-FIXED-G5 \ +// RUN: --check-prefix=CHECK-FIXED-G6 \ +// RUN: --check-prefix=CHECK-FIXED-G7 \ +// RUN: --check-prefix=CHECK-FIXED-O0 \ +// RUN: --check-prefix=CHECK-FIXED-O1 \ +// RUN: --check-prefix=CHECK-FIXED-O2 \ +// RUN: --check-prefix=CHECK-FIXED-O3 \ +// RUN: --check-prefix=CHECK-FIXED-O4 \ +// RUN: --check-prefix=CHECK-FIXED-O5 \ +// RUN: --check-prefix=CHECK-FIXED-L0 \ +// RUN: --check-prefix=CHECK-FIXED-L1 \ +// RUN: --check-prefix=CHECK-FIXED-L2 \ +// RUN: --check-prefix=CHECK-FIXED-L3 \ +// RUN: --check-prefix=CHECK-FIXED-L4 \ +// RUN: --check-prefix=CHECK-FIXED-L5 \ +// RUN: --check-prefix=CHECK-FIXED-L6 \ +// RUN: --check-prefix=CHECK-FIXED-L7 \ +// RUN: --check-prefix=CHECK-FIXED-I0 \ +// RUN: 
--check-prefix=CHECK-FIXED-I1 \ +// RUN: --check-prefix=CHECK-FIXED-I2 \ +// RUN: --check-prefix=CHECK-FIXED-I3 \ +// RUN: --check-prefix=CHECK-FIXED-I4 \ +// RUN: --check-prefix=CHECK-FIXED-I5 \ +// RUN: < %t %s diff --git a/clang/test/Driver/sparc64-codemodel.c b/clang/test/Driver/sparc64-codemodel.c new file mode 100644 index 00000000000000..e4b01fd61b6fac --- /dev/null +++ b/clang/test/Driver/sparc64-codemodel.c @@ -0,0 +1,6 @@ +// RUN: %clang --target=sparc64 -mcmodel=medlow %s -### 2>&1 | FileCheck -check-prefix=MEDLOW %s +// RUN: %clang --target=sparc64 -mcmodel=medmid %s -### 2>&1 | FileCheck -check-prefix=MEDMID %s +// RUN: %clang --target=sparc64 -mcmodel=medany %s -### 2>&1 | FileCheck -check-prefix=MEDANY %s +// MEDLOW: "-mcmodel=small" +// MEDMID: "-mcmodel=medium" +// MEDANY: "-mcmodel=large" diff --git a/clang/test/Driver/tls-dialect.c b/clang/test/Driver/tls-dialect.c new file mode 100644 index 00000000000000..4e105ce3cea5d9 --- /dev/null +++ b/clang/test/Driver/tls-dialect.c @@ -0,0 +1,25 @@ +// RUN: %clang -### --target=riscv64-freebsd -mtls-dialect=desc %s 2>&1 | FileCheck --check-prefix=DESC %s +// RUN: %clang -### --target=riscv64-linux -mtls-dialect=trad %s 2>&1 | FileCheck --check-prefix=NODESC %s +// RUN: %clang -### --target=riscv64-linux %s 2>&1 | FileCheck --check-prefix=NODESC %s +// RUN: %clang -### --target=x86_64-linux -mtls-dialect=gnu %s 2>&1 | FileCheck --check-prefix=NODESC %s + +/// LTO +// RUN: %clang -### --target=riscv64-linux -flto -mtls-dialect=desc %s 2>&1 | FileCheck --check-prefix=LTO-DESC %s +// RUN: %clang -### --target=riscv64-linux -flto %s 2>&1 | FileCheck --check-prefix=LTO-NODESC %s + +/// Unsupported target +/// GCC supports -mtls-dialect= for AArch64, but we just unsupport it for AArch64 as it is very rarely used. 
+// RUN: not %clang --target=aarch64-linux -mtls-dialect=desc %s 2>&1 | FileCheck --check-prefix=UNSUPPORTED-TARGET %s +// RUN: not %clang --target=x86_64-apple-macos -mtls-dialect=desc -flto %s 2>&1 | FileCheck -check-prefix=UNSUPPORTED-TARGET %s + +/// Unsupported argument +// RUN: not %clang -### --target=riscv64-linux -mtls-dialect=gnu2 %s 2>&1 | FileCheck --check-prefix=UNSUPPORTED-ARG %s +// RUN: not %clang -### --target=x86_64-linux -mtls-dialect=gnu2 %s 2>&1 | FileCheck --check-prefix=UNSUPPORTED-ARG %s + +// DESC: "-cc1" {{.*}}"-enable-tlsdesc" +// NODESC-NOT: "-enable-tlsdesc" +// LTO-DESC: "-plugin-opt=-enable-tlsdesc" +// LTO-NODESC-NOT: "-plugin-opt=-enable-tlsdesc" + +// UNSUPPORTED-TARGET: error: unsupported option '-mtls-dialect=' for target +// UNSUPPORTED-ARG: error: unsupported argument 'gnu2' to option '-mtls-dialect=' for target diff --git a/clang/test/Driver/wasm-toolchain.c b/clang/test/Driver/wasm-toolchain.c index f950283ec42aa0..88590a3ba4c453 100644 --- a/clang/test/Driver/wasm-toolchain.c +++ b/clang/test/Driver/wasm-toolchain.c @@ -197,3 +197,27 @@ // RUN: not %clang -### %s --target=wasm32-unknown-unknown --sysroot=%s/no-sysroot-there -fPIC -mno-mutable-globals %s 2>&1 \ // RUN: | FileCheck -check-prefix=PIC_NO_MUTABLE_GLOBALS %s // PIC_NO_MUTABLE_GLOBALS: error: invalid argument '-fPIC' not allowed with '-mno-mutable-globals' + +// Test that `wasm32-wasip2` invokes the `wasm-component-ld` linker by default +// instead of `wasm-ld`. + +// RUN: %clang -### -O2 --target=wasm32-wasip2 %s --sysroot /foo 2>&1 \ +// RUN: | FileCheck -check-prefix=LINK_WASIP2 %s +// LINK_WASIP2: "-cc1" {{.*}} "-o" "[[temp:[^"]*]]" +// LINK_WASIP2: wasm-component-ld{{.*}}" "-L/foo/lib/wasm32-wasip2" "crt1.o" "[[temp]]" "-lc" "{{.*[/\\]}}libclang_rt.builtins-wasm32.a" "-o" "a.out" + +// Test that on `wasm32-wasip2` the `wasm-component-ld` programs is told where +// to find `wasm-ld` by default. 
+ +// RUN: %clang -### -O2 --target=wasm32-wasip2 %s --sysroot /foo 2>&1 \ +// RUN: | FileCheck -check-prefix=LINK_WASIP2_FIND_WASMLD %s +// LINK_WASIP2_FIND_WASMLD: "-cc1" {{.*}} "-o" "[[temp:[^"]*]]" +// LINK_WASIP2_FIND_WASMLD: wasm-component-ld{{.*}}" {{.*}} "--wasm-ld-path" "{{.*}}wasm-ld{{.*}}" {{.*}} "[[temp]]" {{.*}} + +// If `wasm32-wasip2` is configured with `wasm-ld` as a linker then don't pass +// the `--wasm-ld-path` flag. + +// RUN: %clang -### -O2 --target=wasm32-wasip2 -fuse-ld=lld %s --sysroot /foo 2>&1 \ +// RUN: | FileCheck -check-prefix=LINK_WASIP2_USE_WASMLD %s +// LINK_WASIP2_USE_WASMLD: "-cc1" {{.*}} "-o" "[[temp:[^"]*]]" +// LINK_WASIP2_USE_WASMLD: wasm-ld{{.*}}" "-m" "wasm32" {{.*}} "[[temp]]" {{.*}} diff --git a/clang/test/Format/clang-format-ignore.cpp b/clang/test/Format/clang-format-ignore.cpp index b4e526463000ae..fb49fa9dd52c65 100644 --- a/clang/test/Format/clang-format-ignore.cpp +++ b/clang/test/Format/clang-format-ignore.cpp @@ -1,4 +1,3 @@ -// UNSUPPORTED: system-windows // RUN: rm -rf %t.dir // RUN: mkdir -p %t.dir/level1/level2 diff --git a/clang/test/Format/dump-config-objc-stdin.m b/clang/test/Format/dump-config-objc-stdin.m index b22ff7b3328caa..d81711a84d79bf 100644 --- a/clang/test/Format/dump-config-objc-stdin.m +++ b/clang/test/Format/dump-config-objc-stdin.m @@ -1,5 +1,8 @@ +// RUN: clang-format -assume-filename=foo.m -dump-config | FileCheck %s + // RUN: clang-format -dump-config - < %s | FileCheck %s // CHECK: Language: ObjC + @interface Foo @end diff --git a/clang/test/Format/verbose.cpp b/clang/test/Format/verbose.cpp index dd625e3f67e55d..4ab03d8f62aefc 100644 --- a/clang/test/Format/verbose.cpp +++ b/clang/test/Format/verbose.cpp @@ -1,12 +1,6 @@ -// RUN: clang-format %s 2> %t.stderr +// RUN: clang-format -verbose 2> %t.stderr // RUN: not grep "Formatting" %t.stderr -// RUN: clang-format %s -verbose 2> %t.stderr -// RUN: grep -E "Formatting (.*)verbose.cpp(.*)" %t.stderr -// RUN: clang-format %s -verbose=false 2> 
%t.stderr -// RUN: not grep "Formatting" %t.stderr - -int a; -// RUN: clang-format %s 2> %t.stderr +// RUN: clang-format %s 2> %t.stderr // RUN: not grep "Formatting" %t.stderr // RUN: clang-format %s -verbose 2> %t.stderr // RUN: grep -E "Formatting (.*)verbose.cpp(.*)" %t.stderr diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c index 84aed5c9c36fe4..39ed02f50950dd 100644 --- a/clang/test/Misc/target-invalid-cpu-note.c +++ b/clang/test/Misc/target-invalid-cpu-note.c @@ -5,11 +5,11 @@ // RUN: not %clang_cc1 -triple arm64--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix AARCH64 // AARCH64: error: unknown target CPU 'not-a-cpu' -// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, grace{{$}} +// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, 
apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}} // RUN: not %clang_cc1 -triple arm64--- -tune-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix TUNE_AARCH64 // TUNE_AARCH64: error: unknown target CPU 'not-a-cpu' -// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, grace{{$}} +// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, 
apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}} // RUN: not %clang_cc1 -triple i386--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix X86 // X86: error: unknown target CPU 'not-a-cpu' diff --git a/clang/test/Modules/concept.cppm b/clang/test/Modules/concept.cppm index 0e85a46411a544..0fdb5ea8968085 100644 --- a/clang/test/Modules/concept.cppm +++ b/clang/test/Modules/concept.cppm @@ -5,6 +5,12 @@ // RUN: %clang_cc1 -std=c++20 %t/A.cppm -emit-module-interface -o %t/A.pcm // RUN: %clang_cc1 -std=c++20 -fprebuilt-module-path=%t -I%t -DDIFFERENT %t/B.cppm -verify // RUN: %clang_cc1 -std=c++20 -fprebuilt-module-path=%t -I%t %t/B.cppm -verify +// +// Testing the behavior of `-fskip-odr-check-in-gmf` +// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/A.cppm -emit-module-interface -o %t/A.pcm +// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf -fprebuilt-module-path=%t -I%t \ +// RUN: -DDIFFERENT -DSKIP_ODR_CHECK_IN_GMF %t/B.cppm -verify + //--- foo.h #ifndef FOO_H @@ -70,7 +76,10 @@ module; export module B; import A; -#ifdef DIFFERENT +#ifdef SKIP_ODR_CHECK_IN_GMF +// expected-error@B.cppm:* {{call to object of type '__fn' is ambiguous}} +// expected-note@* 1+{{candidate function}} +#elif defined(DIFFERENT) // expected-error@foo.h:41 {{'__fn::operator()' from module 'A.' 
is not present in definition of '__fn' provided earlier}} // expected-note@* 1+{{declaration of 'operator()' does not match}} #else diff --git a/clang/test/Modules/cxx20-modules-enum-odr.cppm b/clang/test/Modules/cxx20-modules-enum-odr.cppm new file mode 100644 index 00000000000000..831c01143a27ba --- /dev/null +++ b/clang/test/Modules/cxx20-modules-enum-odr.cppm @@ -0,0 +1,51 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/mod1.cppm -emit-module-interface -o %t/mod1.pcm +// RUN: %clang_cc1 -std=c++20 %t/mod2.cppm -emit-module-interface -o %t/mod2.pcm +// RUN: %clang_cc1 -std=c++20 %t/test.cpp -fprebuilt-module-path=%t -verify -fsyntax-only + +//--- size_t.h + +extern "C" { + typedef unsigned int size_t; +} + +//--- csize_t +namespace std { + using :: size_t; +} + +//--- align.h +namespace std { + enum class align_val_t : size_t {}; +} + +//--- mod1.cppm +module; +#include "size_t.h" +#include "align.h" +export module mod1; +namespace std { +export using std::align_val_t; +} + +//--- mod2.cppm +module; +#include "size_t.h" +#include "csize_t" +#include "align.h" +export module mod2; +namespace std { +export using std::align_val_t; +} + +//--- test.cpp +// expected-no-diagnostics +import mod1; +import mod2; +void test() { + std::align_val_t v; +} + diff --git a/clang/test/Modules/hashing-decls-in-exprs-from-gmf.cppm b/clang/test/Modules/hashing-decls-in-exprs-from-gmf.cppm new file mode 100644 index 00000000000000..8db53c0ace8796 --- /dev/null +++ b/clang/test/Modules/hashing-decls-in-exprs-from-gmf.cppm @@ -0,0 +1,67 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/A.cppm -emit-module-interface -o %t/A.pcm +// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/B.cppm -emit-module-interface -o %t/B.pcm +// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/test.cpp -fprebuilt-module-path=%t -fsyntax-only -verify + +//--- 
header.h +#pragma once +template +class Optional {}; + +template +concept C = requires(const _Tp& __t) { + [](const Optional<_Up>&) {}(__t); +}; + +//--- func.h +#include "header.h" +template +void func() {} + +//--- duplicated_func.h +#include "header.h" +template +void duplicated_func() {} + +//--- test_func.h +#include "func.h" + +void test_func() { + func>(); +} + +//--- test_duplicated_func.h +#include "duplicated_func.h" + +void test_duplicated_func() { + duplicated_func>(); +} + +//--- A.cppm +module; +#include "header.h" +#include "test_duplicated_func.h" +export module A; +export using ::test_duplicated_func; + +//--- B.cppm +module; +#include "header.h" +#include "test_func.h" +#include "test_duplicated_func.h" +export module B; +export using ::test_func; +export using ::test_duplicated_func; + +//--- test.cpp +// expected-no-diagnostics +import A; +import B; + +void test() { + test_func(); + test_duplicated_func(); +} diff --git a/clang/test/Modules/no-eager-load.cppm b/clang/test/Modules/no-eager-load.cppm index 6632cc60c8eb84..8a2c7656bca2b4 100644 --- a/clang/test/Modules/no-eager-load.cppm +++ b/clang/test/Modules/no-eager-load.cppm @@ -9,19 +9,10 @@ // RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify %t/d.cpp \ // RUN: -fprebuilt-module-path=%t // -// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/e.cppm -o %t/e.pcm -// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/f.cppm -o %t/f.pcm -// RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify %t/g.cpp \ -// RUN: -fprebuilt-module-path=%t -// // RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/h.cppm \ // RUN: -fprebuilt-module-path=%t -o %t/h.pcm -// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/i.cppm \ -// RUN: -fprebuilt-module-path=%t -o %t/i.pcm // RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify %t/j.cpp \ // RUN: -fprebuilt-module-path=%t -// RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify %t/k.cpp \ -// RUN: -fprebuilt-module-path=%t //--- a.cppm export module a; @@ -53,58 
+44,14 @@ void use() { // expected-note@* {{but in 'a' found a different body}} } -//--- foo.h -void foo() { - -} - -//--- bar.h -void bar(); -void foo() { - bar(); -} - -//--- e.cppm -module; -#include "foo.h" -export module e; -export using ::foo; - -//--- f.cppm -module; -#include "bar.h" -export module f; -export using ::foo; - -//--- g.cpp -import e; -import f; -void use() { - foo(); // expected-error@* {{'foo' has different definitions in different modules;}} - // expected-note@* {{but in 'e.' found a different body}} -} - //--- h.cppm export module h; export import a; export import b; -//--- i.cppm -export module i; -export import e; -export import f; - //--- j.cpp import h; void use() { foo(); // expected-error@* {{'foo' has different definitions in different modules;}} // expected-note@* {{but in 'a' found a different body}} } - -//--- k.cpp -import i; -void use() { - foo(); // expected-error@* {{'foo' has different definitions in different modules;}} - // expected-note@* {{but in 'e.' found a different body}} -} - diff --git a/clang/test/Modules/no-undeclared-includes-builtins.cpp b/clang/test/Modules/no-undeclared-includes-builtins.cpp index c9bffc55619905..f9eefd24a33c7d 100644 --- a/clang/test/Modules/no-undeclared-includes-builtins.cpp +++ b/clang/test/Modules/no-undeclared-includes-builtins.cpp @@ -8,7 +8,7 @@ // headers. 
// RUN: rm -rf %t -// RUN: %clang_cc1 -fmodules-cache-path=%t -fmodules -fimplicit-module-maps -I %S/Inputs/no-undeclared-includes-builtins/libcxx -I %S/Inputs/no-undeclared-includes-builtins/glibc %s +// RUN: %clang_cc1 -fmodules-cache-path=%t -fmodules -fbuiltin-headers-in-system-modules -fimplicit-module-maps -I %S/Inputs/no-undeclared-includes-builtins/libcxx -I %S/Inputs/no-undeclared-includes-builtins/glibc %s // expected-no-diagnostics #include diff --git a/clang/test/Modules/polluted-operator.cppm b/clang/test/Modules/polluted-operator.cppm index b24464aa6ad21e..721ca061c939f4 100644 --- a/clang/test/Modules/polluted-operator.cppm +++ b/clang/test/Modules/polluted-operator.cppm @@ -4,6 +4,12 @@ // // RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/a.cppm -o %t/a.pcm // RUN: %clang_cc1 -std=c++20 %t/b.cppm -fprebuilt-module-path=%t -emit-module-interface -o %t/b.pcm -verify +// +// Testing the behavior of `-fskip-odr-check-in-gmf` +// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf -emit-module-interface %t/a.cppm -o \ +// RUN: %t/a.pcm +// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/b.cppm -fprebuilt-module-path=%t \ +// RUN: -emit-module-interface -DSKIP_ODR_CHECK_IN_GMF -o %t/b.pcm -verify //--- foo.h @@ -51,7 +57,11 @@ module; export module b; import a; +#ifdef SKIP_ODR_CHECK_IN_GMF +// expected-no-diagnostics +#else // expected-error@* {{has different definitions in different modules; first difference is defined here found data member '_S_copy_ctor' with an initializer}} // expected-note@* {{but in 'a.' found data member '_S_copy_ctor' with a different initializer}} // expected-error@* {{from module 'a.' 
is not present in definition of 'variant<_Types...>' provided earlier}} // expected-note@* {{declaration of 'swap' does not match}} +#endif diff --git a/clang/test/Modules/pr76638.cppm b/clang/test/Modules/pr76638.cppm index 8cc807961421b7..e4820ba3d79d96 100644 --- a/clang/test/Modules/pr76638.cppm +++ b/clang/test/Modules/pr76638.cppm @@ -10,6 +10,12 @@ // RUN: %clang_cc1 -std=c++20 %t/mod4.cppm -fmodule-file=mod3=%t/mod3.pcm \ // RUN: -fsyntax-only -verify +// Testing the behavior of `-fskip-odr-check-in-gmf` +// RUN: %clang_cc1 -std=c++20 %t/mod3.cppm -fskip-odr-check-in-gmf \ +// RUN: -emit-module-interface -o %t/mod3.pcm +// RUN: %clang_cc1 -std=c++20 %t/mod4.cppm -fmodule-file=mod3=%t/mod3.pcm \ +// RUN: -fskip-odr-check-in-gmf -DSKIP_ODR_CHECK_IN_GMF -fsyntax-only -verify + //--- size_t.h extern "C" { @@ -65,5 +71,9 @@ export module mod4; import mod3; export using std::align_val_t; +#ifdef SKIP_ODR_CHECK_IN_GMF +// expected-no-diagnostics +#else // expected-error@align.h:* {{'std::align_val_t' has different definitions in different modules; defined here first difference is enum with specified type 'size_t' (aka 'int')}} // expected-note@align.h:* {{but in 'mod3.' found enum with specified type 'size_t' (aka 'unsigned int')}} +#endif diff --git a/clang/test/Modules/skip-odr-check-in-gmf.cppm b/clang/test/Modules/skip-odr-check-in-gmf.cppm new file mode 100644 index 00000000000000..3ee7d09224bfa2 --- /dev/null +++ b/clang/test/Modules/skip-odr-check-in-gmf.cppm @@ -0,0 +1,56 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// Baseline testing to make sure we can detect the ODR violation from the CC1 invocation. +// RUNX: %clang_cc1 -std=c++20 %t/a.cppm -emit-module-interface -o %t/a.pcm +// RUNX: %clang_cc1 -std=c++20 %t/b.cppm -emit-module-interface -o %t/b.pcm +// RUNX: %clang_cc1 -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -verify +// +// Testing that we can ignore the ODR violation from the driver invocation. 
+// RUN: %clang -std=c++20 %t/a.cppm --precompile -o %t/a.pcm +// RUN: %clang -std=c++20 %t/b.cppm --precompile -o %t/b.pcm +// RUN: %clang -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -Xclang -verify \ +// RUN: -DIGNORE_ODR_VIOLATION +// +// Testing that the driver can require to check the ODR violation. +// RUN: %clang -std=c++20 -Xclang -fno-skip-odr-check-in-gmf %t/a.cppm --precompile -o %t/a.pcm +// RUN: %clang -std=c++20 -Xclang -fno-skip-odr-check-in-gmf %t/b.cppm --precompile -o %t/b.pcm +// RUN: %clang -std=c++20 -Xclang -fno-skip-odr-check-in-gmf %t/test.cc -fprebuilt-module-path=%t \ +// RUN: -fsyntax-only -Xclang -verify + +//--- func1.h +bool func(int x, int y) { + return true; +} + +//--- func2.h +bool func(int x, int y) { + return false; +} + +//--- a.cppm +module; +#include "func1.h" +export module a; +export using ::func; + +//--- b.cppm +module; +#include "func2.h" +export module b; +export using ::func; + +//--- test.cc +import a; +import b; +bool test() { + return func(1, 2); +} + +#ifdef IGNORE_ODR_VIOLATION +// expected-no-diagnostics +#else +// expected-error@func2.h:1 {{'func' has different definitions in different modules;}} +// expected-note@func1.h:1 {{but in 'a.' 
found a different body}} +#endif diff --git a/clang/test/Modules/stddef.c b/clang/test/Modules/stddef.c index 5bc0d1e44c8563..76239826146810 100644 --- a/clang/test/Modules/stddef.c +++ b/clang/test/Modules/stddef.c @@ -1,29 +1,33 @@ // RUN: rm -rf %t -// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fbuiltin-headers-in-system-modules -fmodules-cache-path=%t -I%S/Inputs/StdDef %s -verify -fno-modules-error-recovery +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fbuiltin-headers-in-system-modules -fmodules-cache-path=%t -I%S/Inputs/StdDef %s -verify=builtin-headers-in-system-modules -fno-modules-error-recovery // RUN: rm -rf %t -// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I%S/Inputs/StdDef %s -verify -fno-modules-error-recovery +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I%S/Inputs/StdDef %s -verify=no-builtin-headers-in-system-modules -fno-modules-error-recovery #include "ptrdiff_t.h" ptrdiff_t pdt; -// size_t is declared in both size_t.h and __stddef_size_t.h, both of which are -// modular headers. Regardless of whether stddef.h joins the StdDef test module -// or is in its _Builtin_stddef module, __stddef_size_t.h will be in -// _Builtin_stddef.size_t. It's not defined which module will win as the expected -// provider of size_t. For the purposes of this test it doesn't matter which header -// gets reported, just as long as it isn't other.h or include_again.h. -size_t st; // expected-error-re {{missing '#include "{{size_t|__stddef_size_t}}.h"'; 'size_t' must be declared before it is used}} -// expected-note@size_t.h:* 0+ {{here}} -// expected-note@__stddef_size_t.h:* 0+ {{here}} +// size_t is declared in both size_t.h and __stddef_size_t.h. If +// -fbuiltin-headers-in-system-modules is set, then __stddef_size_t.h is a +// non-modular header that will be transitively pulled in the StdDef test module +// by include_again.h. Otherwise it will be in the _Builtin_stddef module. 
In +// any case it's not defined which module will win as the expected provider of +// size_t. For the purposes of this test it doesn't matter which of the two +// providing headers get reported. +size_t st; // builtin-headers-in-system-modules-error-re {{missing '#include "{{size_t|include_again}}.h"'; 'size_t' must be declared before it is used}} \ + no-builtin-headers-in-system-modules-error-re {{missing '#include "{{size_t|__stddef_size_t}}.h"'; 'size_t' must be declared before it is used}} +// builtin-headers-in-system-modules-note@size_t.h:* 0+ {{here}} \ + no-builtin-headers-in-system-modules-note@size_t.h:* 0+ {{here}} +// builtin-headers-in-system-modules-note@__stddef_size_t.h:* 0+ {{here}} \ + no-builtin-headers-in-system-modules-note@__stddef_size_t.h:* 0+ {{here}} #include "include_again.h" -// Includes which includes <__stddef_size_t.h> which imports the -// _Builtin_stddef.size_t module. +// Includes which includes <__stddef_size_t.h>. size_t st2; #include "size_t.h" -// Redeclares size_t, but the type merger should figure it out. +// Redeclares size_t when -fbuiltin-headers-in-system-modules is not passed, but +// the type merger should figure it out. 
size_t st3; diff --git a/clang/test/OpenMP/bug54082.c b/clang/test/OpenMP/bug54082.c index b88b68fd43012a..337c120983e0a3 100644 --- a/clang/test/OpenMP/bug54082.c +++ b/clang/test/OpenMP/bug54082.c @@ -69,9 +69,7 @@ void foo() { // CHECK-NEXT: [[X_TRAITS:%.*]] = alloca [1 x %struct.omp_alloctrait_t], align 16 // CHECK-NEXT: [[X_ALLOC:%.*]] = alloca i64, align 8 // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[X_TRAITS]]) #[[ATTR5:[0-9]+]] -// CHECK-NEXT: store i32 2, ptr [[X_TRAITS]], align 16 -// CHECK-NEXT: [[LOC0:%.*]] = getelementptr inbounds i8, ptr [[X_TRAITS]], i64 8 -// CHECK-NEXT: store i64 64, ptr [[LOC0]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(16) [[X_TRAITS]], ptr noundef nonnull align 16 dereferenceable(16) @__const.foo.x_traits, i64 16, i1 false) // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[X_ALLOC]]) #[[ATTR5]] // CHECK-NEXT: [[CALL:%.*]] = call i64 @omp_init_allocator(i64 noundef 0, i32 noundef 1, ptr noundef nonnull [[X_TRAITS]]) #[[ATTR5]] // CHECK-NEXT: store i64 [[CALL]], ptr [[X_ALLOC]], align 8, !tbaa [[TBAA3:![0-9]+]] diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c index 15879da04fcf0e..1e9aec3fdf2373 100644 --- a/clang/test/Preprocessor/aarch64-target-features.c +++ b/clang/test/Preprocessor/aarch64-target-features.c @@ -60,6 +60,10 @@ // CHECK-NOT: __ARM_FEATURE_SVE_BITS 512 // CHECK-NOT: __ARM_FEATURE_SVE_BITS 1024 // CHECK-NOT: __ARM_FEATURE_SVE_BITS 2048 +// CHECK: __ARM_STATE_ZA 1 +// CHECK: __ARM_STATE_ZT0 1 +// CHECK-NOT: __ARM_FEATURE_SME +// CHECK-NOT: __ARM_FEATURE_SME2 // RUN: %clang -target aarch64-none-elf -march=armv8-r -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-R-PROFILE // RUN: %clang -target arm64-none-linux-gnu -march=armv8-r -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-R-PROFILE @@ -314,15 +318,15 @@ // CHECK-MCPU-APPLE-A7: 
"-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8a" "-target-feature" "+aes"{{.*}} "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon" // CHECK-MCPU-APPLE-A10: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8a" "-target-feature" "+aes"{{.*}} "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" // CHECK-MCPU-APPLE-A11: "-cc1"{{.*}} "-triple" "aarch64{{.*}}"{{.*}}"-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+ras" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" -// CHECK-MCPU-APPLE-A12: "-cc1"{{.*}} "-triple" "aarch64"{{.*}} "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.3a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" +// CHECK-MCPU-APPLE-A12: "-cc1"{{.*}} "-triple" "aarch64"{{.*}} "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.3a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+pauth" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" // CHECK-MCPU-A34: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" 
"+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon" -// CHECK-MCPU-APPLE-A13: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "apple-a13" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.4a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+sha3" "-target-feature" "+neon" +// CHECK-MCPU-APPLE-A13: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "apple-a13" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.4a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+pauth" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+sha3" "-target-feature" "+neon" // CHECK-MCPU-A35: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon" // CHECK-MCPU-A53: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon" // CHECK-MCPU-A57: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon" // CHECK-MCPU-A72: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" 
"-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon" // CHECK-MCPU-CORTEX-A73: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon" -// CHECK-MCPU-CORTEX-R82: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8r" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sb" "-target-feature" "+neon" "-target-feature" "+ssbs" +// CHECK-MCPU-CORTEX-R82: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8r" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+pauth" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sb" "-target-feature" "+neon" "-target-feature" "+ssbs" // CHECK-MCPU-M3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon" // CHECK-MCPU-M4: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+ras" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" // CHECK-MCPU-KRYO: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" 
"-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon" @@ -331,10 +335,10 @@ // CHECK-MCPU-CARMEL: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+ras" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" // RUN: %clang -target x86_64-apple-macosx -arch arm64 -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH-ARM64 %s -// CHECK-ARCH-ARM64: "-target-cpu" "apple-m1" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.5a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+sha3" "-target-feature" "+neon" +// CHECK-ARCH-ARM64: "-target-cpu" "apple-m1" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.5a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+pauth" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+sha3" "-target-feature" "+neon" // RUN: %clang -target x86_64-apple-macosx -arch arm64_32 -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH-ARM64_32 %s -// CHECK-ARCH-ARM64_32: "-target-cpu" "apple-s4" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.3a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" 
"-target-feature" "+fullfp16" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" +// CHECK-ARCH-ARM64_32: "-target-cpu" "apple-s4" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.3a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+pauth" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" // RUN: %clang -target aarch64 -march=armv8-a+fp+simd+crc+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-1 %s // RUN: %clang -target aarch64 -march=armv8-a+nofp+nosimd+nocrc+nocrypto+fp+simd+crc+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-1 %s @@ -497,9 +501,10 @@ // CHECK-MEMTAG: __ARM_FEATURE_MEMORY_TAGGING 1 // ================== Check Pointer Authentication Extension (PAuth). 
-// RUN: %clang -target arm64-none-linux-gnu -march=armv8-a -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH-OFF %s -// RUN: %clang -target arm64-none-linux-gnu -march=armv8.5-a -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH-OFF %s -// RUN: %clang -target arm64-none-linux-gnu -march=armv8-a+pauth -mbranch-protection=none -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH-ON %s +// RUN: %clang -target arm64-none-linux-gnu -march=armv8-a -x c -E -dM %s -o - | FileCheck -check-prefixes=CHECK-PAUTH-OFF,CHECK-CPU-NOPAUTH %s +// RUN: %clang -target arm64-none-linux-gnu -march=armv8.5-a+nopauth -x c -E -dM %s -o - | FileCheck -check-prefixes=CHECK-PAUTH-OFF,CHECK-CPU-NOPAUTH %s +// RUN: %clang -target arm64-none-linux-gnu -march=armv8.5-a -x c -E -dM %s -o - | FileCheck -check-prefixes=CHECK-PAUTH-OFF,CHECK-CPU-PAUTH %s +// RUN: %clang -target arm64-none-linux-gnu -march=armv8-a+pauth -mbranch-protection=none -x c -E -dM %s -o - | FileCheck -check-prefixes=CHECK-PAUTH-OFF,CHECK-CPU-PAUTH %s // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a -mbranch-protection=none -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH-OFF %s // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a -mbranch-protection=bti -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH-OFF %s // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a -mbranch-protection=standard -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH %s @@ -507,12 +512,18 @@ // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a -mbranch-protection=pac-ret+b-key -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH-BKEY %s // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a -mbranch-protection=pac-ret+leaf -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH-ALL %s // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a -mbranch-protection=pac-ret+leaf+b-key -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH-BKEY-ALL %s -// 
CHECK-PAUTH-OFF-NOT: __ARM_FEATURE_PAC_DEFAULT -// CHECK-PAUTH: #define __ARM_FEATURE_PAC_DEFAULT 1 -// CHECK-PAUTH-BKEY: #define __ARM_FEATURE_PAC_DEFAULT 2 -// CHECK-PAUTH-ALL: #define __ARM_FEATURE_PAC_DEFAULT 5 -// CHECK-PAUTH-BKEY-ALL: #define __ARM_FEATURE_PAC_DEFAULT 6 -// CHECK-PAUTH-ON: #define __ARM_FEATURE_PAUTH 1 +// +// Note: PAUTH-OFF - pac-ret is disabled +// CPU-NOPAUTH - FEAT_PAUTH support is disabled (but pac-ret can still use HINT-encoded instructions) +// +// CHECK-CPU-NOPAUTH-NOT: __ARM_FEATURE_PAUTH +// CHECK-PAUTH-OFF-NOT: __ARM_FEATURE_PAC_DEFAULT +// CHECK-PAUTH: #define __ARM_FEATURE_PAC_DEFAULT 1 +// CHECK-PAUTH-BKEY: #define __ARM_FEATURE_PAC_DEFAULT 2 +// CHECK-PAUTH-ALL: #define __ARM_FEATURE_PAC_DEFAULT 5 +// CHECK-PAUTH-BKEY-ALL: #define __ARM_FEATURE_PAC_DEFAULT 6 +// CHECK-CPU-PAUTH: #define __ARM_FEATURE_PAUTH 1 +// CHECK-CPU-NOPAUTH-NOT: __ARM_FEATURE_PAUTH // ================== Check Branch Target Identification (BTI). // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-BTI-OFF %s @@ -634,3 +645,12 @@ // RUN: %clang --target=aarch64 -march=armv8.2-a+rcpc3 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-RCPC3 %s // CHECK-RCPC3: __ARM_FEATURE_RCPC 3 + +// RUN: %clang --target=aarch64 -march=armv9-a+sme -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME %s +// CHECK-SME: __ARM_FEATURE_LOCALLY_STREAMING 1 +// CHECK-SME: __ARM_FEATURE_SME 1 +// +// RUN: %clang --target=aarch64 -march=armv9-a+sme2 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME2 %s +// CHECK-SME2: __ARM_FEATURE_LOCALLY_STREAMING 1 +// CHECK-SME2: __ARM_FEATURE_SME 1 +// CHECK-SME2: __ARM_FEATURE_SME2 1 diff --git a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c index f1f1bbbf66945c..cf96870b27acb3 100644 --- a/clang/test/Preprocessor/init-aarch64.c +++ b/clang/test/Preprocessor/init-aarch64.c @@ -32,6 +32,8 @@ // AARCH64-NEXT: #define __ARM_PCS_AAPCS64 1 // 
AARCH64-NEXT: #define __ARM_SIZEOF_MINIMAL_ENUM 4 // AARCH64-NEXT: #define __ARM_SIZEOF_WCHAR_T 4 +// AARCH64-NEXT: #define __ARM_STATE_ZA 1 +// AARCH64-NEXT: #define __ARM_STATE_ZT0 1 // AARCH64-NEXT: #define __ATOMIC_ACQUIRE 2 // AARCH64-NEXT: #define __ATOMIC_ACQ_REL 4 // AARCH64-NEXT: #define __ATOMIC_CONSUME 1 diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index 39d2c66f14b23f..30697af89c2ebe 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -764,6 +764,14 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-ZICNTR-EXT %s // CHECK-ZICNTR-EXT: __riscv_zicntr 2000000{{$}} +// RUN: %clang --target=riscv32 \ +// RUN: -march=rv32i_zicond1p0 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZICOND-EXT %s +// RUN: %clang --target=riscv64 \ +// RUN: -march=rv64i_zicond1p0 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZICOND-EXT %s +// CHECK-ZICOND-EXT: __riscv_zicond 1000000{{$}} + // RUN: %clang --target=riscv32-unknown-linux-gnu \ // RUN: -march=rv32izicsr2p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZICSR-EXT %s @@ -1332,14 +1340,6 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-ZICFILP-EXT %s // CHECK-ZICFILP-EXT: __riscv_zicfilp 4000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ -// RUN: -march=rv32i_zicond1p0 -x c -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZICOND-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ -// RUN: -march=rv64i_zicond1p0 -x c -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZICOND-EXT %s -// CHECK-ZICOND-EXT: __riscv_zicond 1000000{{$}} - // RUN: %clang --target=riscv32 -menable-experimental-extensions \ // RUN: -march=rv32i_zimop0p1 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZIMOP-EXT %s diff --git a/clang/test/Sema/aarch64-sme-func-attrs.c 
b/clang/test/Sema/aarch64-sme-func-attrs.c index 97409ae7d6040c..2bf1886951f1f7 100644 --- a/clang/test/Sema/aarch64-sme-func-attrs.c +++ b/clang/test/Sema/aarch64-sme-func-attrs.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify=expected-cpp -x c++ %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -fsyntax-only -verify=expected-cpp -x c++ %s // Valid attributes @@ -445,3 +445,12 @@ void conflicting_state_attrs_preserves_out_zt0(void) __arm_preserves("zt0") __ar // expected-cpp-error@+2 {{conflicting attributes for state 'zt0'}} // expected-error@+1 {{conflicting attributes for state 'zt0'}} void conflicting_state_attrs_preserves_inout_zt0(void) __arm_preserves("zt0") __arm_inout("zt0"); + +// Test that we get a diagnostic for unimplemented case. 
+void unimplemented_spill_fill_za(void (*share_zt0_only)(void) __arm_inout("zt0")) __arm_inout("za", "zt0") { + // expected-cpp-error@+4 {{call to a function that shares state other than 'za' from a function that has live 'za' state requires a spill/fill of ZA, which is not yet implemented}} + // expected-cpp-note@+3 {{add '__arm_preserves("za")' to the callee if it preserves ZA}} + // expected-error@+2 {{call to a function that shares state other than 'za' from a function that has live 'za' state requires a spill/fill of ZA, which is not yet implemented}} + // expected-note@+1 {{add '__arm_preserves("za")' to the callee if it preserves ZA}} + share_zt0_only(); +} diff --git a/clang/test/Sema/attr-riscv-rvv-vector-bits.c b/clang/test/Sema/attr-riscv-rvv-vector-bits.c index fe507a102cee1e..60ba2aa034f6e1 100644 --- a/clang/test/Sema/attr-riscv-rvv-vector-bits.c +++ b/clang/test/Sema/attr-riscv-rvv-vector-bits.c @@ -228,8 +228,19 @@ typedef vint8m1_t two_arguments __attribute__((riscv_rvv_vector_bits(2, 4))); // typedef vint8m1_t non_int_size1 __attribute__((riscv_rvv_vector_bits(2.0))); // expected-error {{'riscv_rvv_vector_bits' attribute requires an integer constant}} typedef vint8m1_t non_int_size2 __attribute__((riscv_rvv_vector_bits("256"))); // expected-error {{'riscv_rvv_vector_bits' attribute requires an integer constant}} -// bool types and LMUL != 1 are not supported. 
-typedef vbool1_t fixed_vbool1_t_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); // expected-error {{'riscv_rvv_vector_bits' attribute applied to non-RVV type 'vbool1_t'}} +typedef vbool1_t fixed_bool1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vbool2_t fixed_bool2_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 2))); +typedef vbool4_t fixed_bool4_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 4))); +typedef vbool8_t fixed_bool8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 8))); +#if __riscv_v_fixed_vlen / 16 >= 8 +typedef vbool16_t fixed_bool16_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 16))); +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 +typedef vbool32_t fixed_bool32_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 32))); +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 +typedef vbool64_t fixed_bool64_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 64))); +#endif // Attribute must be attached to a single RVV vector or predicate type. typedef void *badtype1 __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); // expected-error {{'riscv_rvv_vector_bits' attribute applied to non-RVV type 'void *'}} @@ -242,10 +253,13 @@ vint8m1_t non_typedef_type __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_ // Test that we can define non-local fixed-length RVV types (unsupported for // sizeless types). 
fixed_int8m1_t global_int8; +fixed_bool1_t global_bool1; extern fixed_int8m1_t extern_int8; +extern fixed_bool1_t extern_bool1; static fixed_int8m1_t static_int8; +static fixed_bool1_t static_bool1; fixed_int8m1_t *global_int8_ptr; extern fixed_int8m1_t *extern_int8_ptr; @@ -398,6 +412,20 @@ _Static_assert(sizeof(fixed_int64m8_t) == VECTOR_SIZE * 8, ""); _Static_assert(sizeof(fixed_float32m8_t) == VECTOR_SIZE * 8, ""); _Static_assert(sizeof(fixed_float64m8_t) == VECTOR_SIZE * 8, ""); +_Static_assert(sizeof(fixed_bool1_t) == VECTOR_SIZE, ""); +_Static_assert(sizeof(fixed_bool2_t) == VECTOR_SIZE / 2, ""); +_Static_assert(sizeof(fixed_bool4_t) == VECTOR_SIZE / 4, ""); +_Static_assert(sizeof(fixed_bool8_t) == VECTOR_SIZE / 8, ""); +#if __riscv_v_fixed_vlen / 16 >= 8 +_Static_assert(sizeof(fixed_bool16_t) == VECTOR_SIZE / 16, ""); +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 +_Static_assert(sizeof(fixed_bool32_t) == VECTOR_SIZE / 32, ""); +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 +_Static_assert(sizeof(fixed_bool64_t) == VECTOR_SIZE / 64, ""); +#endif + // --------------------------------------------------------------------------// // Alignof @@ -475,6 +503,20 @@ _Static_assert(__alignof__(fixed_uint64m8_t) == VECTOR_ALIGN, ""); _Static_assert(__alignof__(fixed_float32m8_t) == VECTOR_ALIGN, ""); _Static_assert(__alignof__(fixed_float64m8_t) == VECTOR_ALIGN, ""); +_Static_assert(__alignof__(fixed_bool1_t) == VECTOR_ALIGN, ""); +_Static_assert(__alignof__(fixed_bool2_t) == (sizeof(fixed_bool2_t) < VECTOR_ALIGN ? sizeof(fixed_bool2_t) : VECTOR_ALIGN), ""); +_Static_assert(__alignof__(fixed_bool4_t) == (sizeof(fixed_bool4_t) < VECTOR_ALIGN ? sizeof(fixed_bool4_t) : VECTOR_ALIGN), ""); +_Static_assert(__alignof__(fixed_bool8_t) == (sizeof(fixed_bool8_t) < VECTOR_ALIGN ? sizeof(fixed_bool8_t) : VECTOR_ALIGN), ""); +#if __riscv_v_fixed_vlen / 16 >= 8 +_Static_assert(__alignof__(fixed_bool16_t) == (sizeof(fixed_bool16_t) < VECTOR_ALIGN ? 
sizeof(fixed_bool16_t) : VECTOR_ALIGN), ""); +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 +_Static_assert(__alignof__(fixed_bool32_t) == (sizeof(fixed_bool32_t) < VECTOR_ALIGN ? sizeof(fixed_bool32_t) : VECTOR_ALIGN), ""); +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 +_Static_assert(__alignof__(fixed_bool64_t) == (sizeof(fixed_bool64_t) < VECTOR_ALIGN ? sizeof(fixed_bool64_t) : VECTOR_ALIGN), ""); +#endif + // --------------------------------------------------------------------------// // Structs @@ -580,6 +622,26 @@ TEST_CAST_VECTOR(uint64m8) TEST_CAST_VECTOR(float32m8) TEST_CAST_VECTOR(float64m8) +TEST_CAST_COMMON(bool1); +TEST_CAST_COMMON(bool2); +TEST_CAST_COMMON(bool4); +TEST_CAST_COMMON(bool8); +#if __riscv_v_fixed_vlen / 16 >= 8 +TEST_CAST_COMMON(bool16); +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 +TEST_CAST_COMMON(bool32); +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 +TEST_CAST_COMMON(bool64); +#endif + +// Test conversion between mask and uint8 is invalid, both have the same +// memory representation. 
+fixed_bool1_t to_fixed_bool1_t__from_vuint8m1_t(vuint8m1_t x) { return x; } // expected-error-re {{returning 'vuint8m1_t' (aka '__rvv_uint8m1_t') from a function with incompatible result type 'fixed_bool1_t' (vector of {{[0-9]+}} 'unsigned char' values)}} + +// --------------------------------------------------------------------------// + // --------------------------------------------------------------------------// // Test the scalable and fixed-length types can be used interchangeably @@ -595,6 +657,14 @@ vfloat64m4_t __attribute__((overloadable)) vfunc(vfloat64m4_t op1, vfloat64m4_t vint32m8_t __attribute__((overloadable)) vfunc(vint32m8_t op1, vint32m8_t op2); vfloat64m8_t __attribute__((overloadable)) vfunc(vfloat64m8_t op1, vfloat64m8_t op2); +vbool1_t __attribute__((overloadable)) vfunc(vbool1_t op1, vbool1_t op2); +vbool2_t __attribute__((overloadable)) vfunc(vbool2_t op1, vbool2_t op2); +vbool4_t __attribute__((overloadable)) vfunc(vbool4_t op1, vbool4_t op2); +vbool8_t __attribute__((overloadable)) vfunc(vbool8_t op1, vbool8_t op2); +vbool16_t __attribute__((overloadable)) vfunc(vbool16_t op1, vbool16_t op2); +vbool32_t __attribute__((overloadable)) vfunc(vbool32_t op1, vbool32_t op2); +vbool64_t __attribute__((overloadable)) vfunc(vbool64_t op1, vbool64_t op2); + #define TEST_CALL(TYPE) \ fixed_##TYPE##_t \ call_##TYPE##_ff(fixed_##TYPE##_t op1, fixed_##TYPE##_t op2) { \ @@ -621,6 +691,20 @@ TEST_CALL(float64m4) TEST_CALL(int32m8) TEST_CALL(float64m8) +TEST_CALL(bool1) +TEST_CALL(bool2) +TEST_CALL(bool4) +TEST_CALL(bool8) +#if __riscv_v_fixed_vlen / 16 >= 8 +TEST_CALL(bool16) +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 +TEST_CALL(bool32) +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 +TEST_CALL(bool64) +#endif + // --------------------------------------------------------------------------// // Vector initialization diff --git a/clang/test/Sema/inline-asm-validate-mips.c b/clang/test/Sema/inline-asm-validate-mips.c new file mode 100644 index 
00000000000000..7da248fe417b5c --- /dev/null +++ b/clang/test/Sema/inline-asm-validate-mips.c @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -triple mips64 -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple mips64 -target-feature +soft-float -fsyntax-only -verify=softfloat %s + +// expected-no-diagnostics + +void test_f(float p) { + float result = p; + __asm__("" :: "f"(result)); // softfloat-error{{invalid input constraint 'f' in asm}} +} diff --git a/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp b/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp index 8a610fa0e737e1..03a432e05851d1 100644 --- a/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp +++ b/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp @@ -1,13 +1,31 @@ -// RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan -triple powerpc64le-unknown-unknown %s \ -// RUN: -menable-no-infs -menable-no-nans +// RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan \ +// RUN: -triple powerpc64le-unknown-unknown %s -menable-no-infs \ +// RUN: -menable-no-nans -std=c++23 -// RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown %s +// RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan \ +// RUN: -triple powerpc64le-unknown-unknown %s -menable-no-infs \ +// RUN: -menable-no-nans -funsafe-math-optimizations -std=c++23 + +// RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown \ +// RUN: %s -std=c++23 + +// RUN: %clang_cc1 -x c++ -verify=no-inf -triple powerpc64le-unknown-unknown %s \ +// RUN: -menable-no-infs -std=c++23 // RUN: %clang_cc1 -x c++ -verify=no-inf -triple powerpc64le-unknown-unknown %s \ -// RUN: -menable-no-infs +// RUN: -menable-no-infs -funsafe-math-optimizations -std=c++23 + +// RUN: %clang_cc1 -x c++ -verify=no-nan -triple powerpc64le-unknown-unknown %s \ +// RUN: -menable-no-nans -std=c++23 // RUN: %clang_cc1 -x c++ -verify=no-nan -triple powerpc64le-unknown-unknown %s \ -// RUN: -menable-no-nans +// RUN: -funsafe-math-optimizations -menable-no-nans -std=c++23 + +// RUN: %clang_cc1 
-x c++ -verify=no-fast -triple powerpc64le-unknown-unknown \ +// RUN: %s -Wno-nan-infinity-disabled -menable-no-infs -std=c++23 + +// RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown \ +// RUN: %s -Wno-nan-infinity-disabled -menable-no-nans -std=c++23 // no-fast-no-diagnostics @@ -133,13 +151,41 @@ int compareit(float a, float b) { // no-inf-warning@+1 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} p = __builtin_isfinite(a); - // These should NOT warn, since they are not using NaN or infinity. +// These should NOT warn, since they are not using NaN or infinity. j = a > 1.1; j = b < 1.1; j = a >= 1.1; j = b <= 1.1; j = isunorderedf(a, b); +#ifndef INFINITY + j = a; +#endif +#ifndef NAN + j = b; +#endif +#ifdef INFINITY + j = a; +#endif +#ifdef NAN + j = b; +#endif +#if defined(INFINITY) + j = a; +#elifndef(INFINITY) + j = b; +#endif +#if defined(INFINITY) + j = a; +#elifndef(NAN) + j = b; +#endif +#if defined(NAN) + j = a; +#elifndef(INFINITY) + j = b; +#endif + // no-inf-no-nan-warning@+4 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} // no-inf-no-nan-warning@+3 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} // no-nan-warning@+2 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} @@ -173,4 +219,4 @@ int compareit(float a, float b) { j = numeric_limits::infinity(); return 0; -} +} diff --git a/clang/test/Sema/warn-infinity-nan-disabled-win.cpp b/clang/test/Sema/warn-infinity-nan-disabled-win.cpp index 19a575386e3293..51f9d325619ba0 100644 --- a/clang/test/Sema/warn-infinity-nan-disabled-win.cpp +++ b/clang/test/Sema/warn-infinity-nan-disabled-win.cpp @@ -1,16 +1,34 @@ // Use of NAN macro will trigger a warning "infinity defined in macro" because // on Windows the NAN macro is defined using INFINITY. See below. 
-// RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan -triple powerpc64le-unknown-unknown %s \ -// RUN: -menable-no-infs -menable-no-nans +// RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan \ +// RUN: -triple powerpc64le-unknown-unknown %s -menable-no-infs \ +// RUN: -menable-no-nans -std=c++23 -// RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown %s +// RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan \ +// RUN: -triple powerpc64le-unknown-unknown %s -menable-no-infs \ +// RUN: -menable-no-nans -funsafe-math-optimizations -std=c++23 + +// RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown \ +// RUN: %s -std=c++23 + +// RUN: %clang_cc1 -x c++ -verify=no-inf -triple powerpc64le-unknown-unknown %s \ +// RUN: -menable-no-infs -std=c++23 // RUN: %clang_cc1 -x c++ -verify=no-inf -triple powerpc64le-unknown-unknown %s \ -// RUN: -menable-no-infs +// RUN: -menable-no-infs -funsafe-math-optimizations -std=c++23 + +// RUN: %clang_cc1 -x c++ -verify=no-nan -triple powerpc64le-unknown-unknown %s \ +// RUN: -menable-no-nans -std=c++23 // RUN: %clang_cc1 -x c++ -verify=no-nan -triple powerpc64le-unknown-unknown %s \ -// RUN: -menable-no-nans +// RUN: -funsafe-math-optimizations -menable-no-nans -std=c++23 + +// RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown \ +// RUN: %s -Wno-nan-infinity-disabled -menable-no-infs -std=c++23 + +// RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown \ +// RUN: %s -Wno-nan-infinity-disabled -menable-no-nans -std=c++23 // no-fast-no-diagnostics @@ -136,13 +154,41 @@ int compareit(float a, float b) { // no-inf-warning@+1 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} p = __builtin_isfinite(a); - // These should NOT warn, since they are not using NaN or infinity. +// These should NOT warn, since they are not using NaN or infinity. 
j = a > 1.1; j = b < 1.1; j = a >= 1.1; j = b <= 1.1; j = isunorderedf(a, b); +#ifndef INFINITY + j = a; +#endif +#ifndef NAN + j = b; +#endif +#ifdef INFINITY + j = a; +#endif +#ifdef NAN + j = b; +#endif +#if defined(INFINITY) + j = a; +#elifndef(INFINITY) + j = b; +#endif +#if defined(INFINITY) + j = a; +#elifndef(NAN) + j = b; +#endif +#if defined(NAN) + j = a; +#elifndef(INFINITY) + j = b; +#endif + // no-inf-no-nan-warning@+4 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point option}} // no-inf-no-nan-warning@+3 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} // no-inf-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} @@ -176,4 +222,4 @@ int compareit(float a, float b) { j = numeric_limits::infinity(); return 0; -} +} diff --git a/clang/test/Sema/warn-int-in-bool-context.c b/clang/test/Sema/warn-int-in-bool-context.c index 0c94ebb391f3c5..99f3db9f8d41a7 100644 --- a/clang/test/Sema/warn-int-in-bool-context.c +++ b/clang/test/Sema/warn-int-in-bool-context.c @@ -72,3 +72,14 @@ int test(int a, unsigned b, enum num n) { // Don't warn in macros. 
return SHIFT(1, a); } + +int GH64356(int arg) { + if ((arg == 1) && (1 == 1)) return 1; + return 0; + + if ((64 > 32) && (32 < 64)) + return 2; + + if ((1 == 1) && (arg == 1)) return 1; + return 0; +} diff --git a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp index 531a6262287335..4a75392045d05a 100644 --- a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp +++ b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp @@ -368,3 +368,29 @@ vector v{}; // expected-note@-2 {{in call to 'vector()'}} } + + +namespace GH82258 { + +template +constexpr auto none_of(R&& r, Pred pred) -> bool { return true; } + +struct info { int value; }; +consteval auto is_invalid(info i) -> bool { return false; } +constexpr info types[] = { {1}, {3}, {5}}; + +static_assert(none_of( + types, + +[](info i) consteval { + return is_invalid(i); + } +)); + +static_assert(none_of( + types, + []{ + return is_invalid; + }() +)); + +} diff --git a/clang/test/SemaCXX/cxx2b-static-operator.cpp b/clang/test/SemaCXX/cxx2b-static-operator.cpp new file mode 100644 index 00000000000000..4d6f1f76d13157 --- /dev/null +++ b/clang/test/SemaCXX/cxx2b-static-operator.cpp @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 -fsyntax-only -verify -std=c++23 %s + +// expected-no-diagnostics + +namespace A { + +struct Foo { + static int operator()(int a, int b) { return a + b; } + static int operator[](int a, int b) { return a + b; } +}; + +void ok() { + // Should pass regardless of const / volatile + Foo foo; + foo(1, 2); + foo[1, 2]; + + const Foo fooC; + fooC(1, 2); + fooC[1, 2]; + + const Foo fooV; + fooV(1, 2); + fooV[1, 2]; + + const volatile Foo fooCV; + fooCV(1, 2); + fooCV[1, 2]; +} + +} diff --git a/clang/test/SemaCXX/gh53815.cpp b/clang/test/SemaCXX/gh53815.cpp new file mode 100644 index 00000000000000..326c911c7bfaf5 --- /dev/null +++ b/clang/test/SemaCXX/gh53815.cpp @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 -fsyntax-only -verify -std=c++20 %s +// expected-no-diagnostics + +// 
Check that we don't crash due to forgetting to check for placeholders +// in the RHS of '.*'. + +template +static bool has_explicitly_named_overload() { + return requires { Fn().*&Fn::operator(); }; +} + +int main() { + has_explicitly_named_overload(); +} + +template +constexpr bool has_explicitly_named_overload_2() { + return requires { Fn().*&Fn::operator(); }; +} + +static_assert(!has_explicitly_named_overload_2()); diff --git a/clang/test/SemaCXX/warn-shadow-in-lambdas.cpp b/clang/test/SemaCXX/warn-shadow-in-lambdas.cpp index bda6a65c02168b..d54b394df4eb84 100644 --- a/clang/test/SemaCXX/warn-shadow-in-lambdas.cpp +++ b/clang/test/SemaCXX/warn-shadow-in-lambdas.cpp @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -std=c++14 -verify -fsyntax-only -Wshadow -D AVOID %s -// RUN: %clang_cc1 -std=c++14 -verify -fsyntax-only -Wshadow -Wshadow-uncaptured-local %s -// RUN: %clang_cc1 -std=c++14 -verify -fsyntax-only -Wshadow-all %s +// RUN: %clang_cc1 -std=c++14 -verify=expected,cxx14 -fsyntax-only -Wshadow -D AVOID %s +// RUN: %clang_cc1 -std=c++14 -verify=expected,cxx14 -fsyntax-only -Wshadow -Wshadow-uncaptured-local %s +// RUN: %clang_cc1 -std=c++14 -verify=expected,cxx14 -fsyntax-only -Wshadow-all %s // RUN: %clang_cc1 -std=c++17 -verify -fsyntax-only -Wshadow-all %s // RUN: %clang_cc1 -std=c++20 -verify -fsyntax-only -Wshadow-all %s @@ -179,3 +179,89 @@ void f() { #endif } } + +namespace GH71976 { +#ifdef AVOID +struct A { + int b = 5; + int foo() { + return [b = b]() { return b; }(); // no -Wshadow diagnostic, init-capture does not shadow b due to not capturing this + } +}; + +struct B { + int a; + void foo() { + auto b = [a = this->a] {}; // no -Wshadow diagnostic, init-capture does not shadow a due to not capturing his + } +}; + +struct C { + int b = 5; + int foo() { + return [a = b]() { + return [=, b = a]() { // no -Wshadow diagnostic, init-capture does not shadow b due to outer lambda + return b; + }(); + }(); + } +}; + +#else +struct A { + int b = 5; // expected-note 
{{previous}} + int foo() { + return [b = b]() { return b; }(); // expected-warning {{declaration shadows a field}} + } +}; + +struct B { + int a; // expected-note {{previous}} + void foo() { + auto b = [a = this->a] {}; // expected-warning {{declaration shadows a field}} + } +}; + +struct C { + int b = 5; // expected-note {{previous}} + int foo() { + return [a = b]() { + return [=, b = a]() { // expected-warning {{declaration shadows a field}} + return b; + }(); + }(); + } +}; + +struct D { + int b = 5; // expected-note {{previous}} + int foo() { + return [b = b, this]() { return b; }(); // expected-warning {{declaration shadows a field}} + } +}; + +struct E { + int b = 5; + int foo() { + return [a = b]() { // expected-note {{previous}} + return [=, a = a]() { // expected-warning {{shadows a local}} + return a; + }(); + }(); + } +}; + +#endif + +struct S { + int a ; +}; + +int foo() { + auto [a] = S{0}; // expected-note {{previous}} \ + // cxx14-warning {{decomposition declarations are a C++17 extension}} + [a = a] () { // expected-warning {{declaration shadows a structured binding}} + }(); +} + +} diff --git a/clang/test/SemaTemplate/concepts-friends.cpp b/clang/test/SemaTemplate/concepts-friends.cpp index 255b0858917fb6..0b008811f13621 100644 --- a/clang/test/SemaTemplate/concepts-friends.cpp +++ b/clang/test/SemaTemplate/concepts-friends.cpp @@ -478,3 +478,29 @@ template class Foo { }; } // namespace FriendOfFriend + +namespace GH86769 { + +template +concept X = true; + +template struct Y { + Y(T) {} + template friend struct Y; + template friend struct Y; + template friend struct Y; +}; + +template +struct Z { + // FIXME: This is ill-formed per C++11 [temp.param]p12: + // A default template argument shall not be specified in a friend class + // template declaration. 
+ template friend struct Y; +}; + +template struct Y; +template struct Z; +Y y(1); + +} diff --git a/clang/test/SemaTemplate/concepts-lambda.cpp b/clang/test/SemaTemplate/concepts-lambda.cpp index 7e431529427dff..0b7580f91043c7 100644 --- a/clang/test/SemaTemplate/concepts-lambda.cpp +++ b/clang/test/SemaTemplate/concepts-lambda.cpp @@ -149,3 +149,21 @@ void foo() { auto caller = make_caller.operator()<&S1::f1>(); } } // namespace ReturnTypeRequirementInLambda + +namespace GH73418 { +void foo() { + int x; + [&x](auto) { + return [](auto y) { + return [](auto obj, auto... params) + requires requires { + sizeof...(params); + [](auto... pack) { + return sizeof...(pack); + }(params...); + } + { return false; }(y); + }(x); + }(x); +} +} // namespace GH73418 diff --git a/clang/test/SemaTemplate/ctad.cpp b/clang/test/SemaTemplate/ctad.cpp index 388ed7d4cced18..ec144d4f44ba8c 100644 --- a/clang/test/SemaTemplate/ctad.cpp +++ b/clang/test/SemaTemplate/ctad.cpp @@ -53,4 +53,4 @@ X x; template struct Y { Y(T); }; template struct Y ; Y y(1); -}; +} diff --git a/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp b/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp index b5b8cadc909ce0..ad73daa8e214c3 100644 --- a/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp +++ b/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp @@ -336,3 +336,38 @@ template void bar(B b) { (b.operator Tbar(), ...); } } + +namespace ReportedRegression1 { + const char kt[] = "dummy"; + + template + class SomeTempl { }; + + template + class SomeTempl { + public: + int exit_code() const { return 0; } + }; + + int use() { + SomeTempl dummy; + return dummy.exit_code(); + } +} + +namespace ReportedRegression2 { + const char str[] = "dummy"; + + struct S { + S operator+(const char*) const; + }; + + template + void fn() { + auto s = S{} + in; + } + + void use() { + fn(); + } +} diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp index 5ee6092bb9bb7f..e122cea50f7268 
100644 --- a/clang/tools/clang-format/ClangFormat.cpp +++ b/clang/tools/clang-format/ClangFormat.cpp @@ -399,7 +399,8 @@ class ClangFormatDiagConsumer : public DiagnosticConsumer { }; // Returns true on error. -static bool format(StringRef FileName, bool IsSTDIN) { +static bool format(StringRef FileName) { + const bool IsSTDIN = FileName == "-"; if (!OutputXML && Inplace && IsSTDIN) { errs() << "error: cannot use -i when reading from stdin.\n"; return false; @@ -545,24 +546,25 @@ static void PrintVersion(raw_ostream &OS) { } // Dump the configuration. -static int dumpConfig(bool IsSTDIN) { +static int dumpConfig() { std::unique_ptr Code; - - // `FileNames` must have at least "-" in it even if no file was specified. - assert(!FileNames.empty()); - - // Read in the code in case the filename alone isn't enough to detect the - // language. - ErrorOr> CodeOrErr = - MemoryBuffer::getFileOrSTDIN(FileNames[0]); - if (std::error_code EC = CodeOrErr.getError()) { - llvm::errs() << EC.message() << "\n"; - return 1; + // We can't read the code to detect the language if there's no file name. + if (!FileNames.empty()) { + // Read in the code in case the filename alone isn't enough to detect the + // language. + ErrorOr> CodeOrErr = + MemoryBuffer::getFileOrSTDIN(FileNames[0]); + if (std::error_code EC = CodeOrErr.getError()) { + llvm::errs() << EC.message() << "\n"; + return 1; + } + Code = std::move(CodeOrErr.get()); } - Code = std::move(CodeOrErr.get()); - llvm::Expected FormatStyle = - clang::format::getStyle(Style, IsSTDIN ? AssumeFileName : FileNames[0], + clang::format::getStyle(Style, + FileNames.empty() || FileNames[0] == "-" + ? AssumeFileName + : FileNames[0], FallbackStyle, Code ? 
Code->getBuffer() : ""); if (!FormatStyle) { llvm::errs() << llvm::toString(FormatStyle.takeError()) << "\n"; @@ -682,11 +684,8 @@ int main(int argc, const char **argv) { return 0; } - if (FileNames.empty()) - FileNames.push_back("-"); - if (DumpConfig) - return dumpConfig(FileNames[0] == "-"); + return dumpConfig(); if (!Files.empty()) { std::ifstream ExternalFileOfFiles{std::string(Files)}; @@ -699,7 +698,10 @@ int main(int argc, const char **argv) { errs() << "Clang-formating " << LineNo << " files\n"; } - if (FileNames.size() != 1 && + if (FileNames.empty()) + return clang::format::format("-"); + + if (FileNames.size() > 1 && (!Offsets.empty() || !Lengths.empty() || !LineRanges.empty())) { errs() << "error: -offset, -length and -lines can only be used for " "single file.\n"; @@ -709,14 +711,13 @@ int main(int argc, const char **argv) { unsigned FileNo = 1; bool Error = false; for (const auto &FileName : FileNames) { - const bool IsSTDIN = FileName == "-"; - if (!IsSTDIN && isIgnored(FileName)) + if (isIgnored(FileName)) continue; if (Verbose) { errs() << "Formatting [" << FileNo++ << "/" << FileNames.size() << "] " << FileName << "\n"; } - Error |= clang::format::format(FileName, IsSTDIN); + Error |= clang::format::format(FileName); } return Error ? 
1 : 0; } diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index 2a8d79359a49b4..6436581ddae5ae 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -231,6 +231,7 @@ TEST(ConfigParseTest, ParsesConfigurationBools) { AfterFunctionDefinitionName); CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions, AfterIfMacros); CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions, AfterOverloadedOperator); + CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions, AfterPlacementOperator); CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions, BeforeNonEmptyParentheses); CHECK_PARSE_NESTED_BOOL(SpacesInParensOptions, InCStyleCasts); CHECK_PARSE_NESTED_BOOL(SpacesInParensOptions, InConditionalStatements); @@ -609,24 +610,6 @@ TEST(ConfigParseTest, ParsesConfiguration) { SpaceBeforeParens, FormatStyle::SBPO_ControlStatementsExceptControlMacros); - Style.SpaceBeforeParens = FormatStyle::SBPO_Custom; - Style.SpaceBeforeParensOptions.AfterPlacementOperator = - FormatStyle::SpaceBeforeParensCustom::APO_Always; - CHECK_PARSE("SpaceBeforeParensOptions:\n" - " AfterPlacementOperator: Never", - SpaceBeforeParensOptions.AfterPlacementOperator, - FormatStyle::SpaceBeforeParensCustom::APO_Never); - - CHECK_PARSE("SpaceBeforeParensOptions:\n" - " AfterPlacementOperator: Always", - SpaceBeforeParensOptions.AfterPlacementOperator, - FormatStyle::SpaceBeforeParensCustom::APO_Always); - - CHECK_PARSE("SpaceBeforeParensOptions:\n" - " AfterPlacementOperator: Leave", - SpaceBeforeParensOptions.AfterPlacementOperator, - FormatStyle::SpaceBeforeParensCustom::APO_Leave); - // For backward compatibility: Style.SpacesInParens = FormatStyle::SIPO_Never; Style.SpacesInParensOptions = {}; diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index e5e763edf5b5bf..88877e53d014c6 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -1865,6 
+1865,13 @@ TEST_F(FormatTest, UnderstandsMacros) { verifyFormat("MACRO(co_return##something)"); verifyFormat("#define A x:"); + + verifyFormat("#define Foo(Bar) {#Bar}", "#define Foo(Bar) \\\n" + " { \\\n" + " #Bar \\\n" + " }"); + verifyFormat("#define Foo(Bar) {#Bar}", "#define Foo(Bar) \\\n" + " { #Bar }"); } TEST_F(FormatTest, ShortBlocksInMacrosDontMergeWithCodeAfterMacro) { @@ -10865,7 +10872,7 @@ TEST_F(FormatTest, UnderstandsTemplateParameters) { verifyFormat("some_templated_type"); verifyFormat("#define FOO(typeName, realClass) \\\n" - " { #typeName, foo(new foo(#typeName)) }", + " {#typeName, foo(new foo(#typeName))}", getLLVMStyleWithColumns(60)); } @@ -11347,35 +11354,31 @@ TEST_F(FormatTest, UnderstandsNewAndDelete) { FormatStyle AfterPlacementOperator = getLLVMStyle(); AfterPlacementOperator.SpaceBeforeParens = FormatStyle::SBPO_Custom; - EXPECT_EQ( - AfterPlacementOperator.SpaceBeforeParensOptions.AfterPlacementOperator, - FormatStyle::SpaceBeforeParensCustom::APO_Leave); + EXPECT_TRUE( + AfterPlacementOperator.SpaceBeforeParensOptions.AfterPlacementOperator); verifyFormat("new (buf) int;", AfterPlacementOperator); - verifyFormat("new(buf) int;", AfterPlacementOperator); - - AfterPlacementOperator.SpaceBeforeParensOptions.AfterPlacementOperator = - FormatStyle::SpaceBeforeParensCustom::APO_Never; verifyFormat("struct A {\n" " int *a;\n" - " A(int *p) : a(new(p) int) {\n" - " new(p) int;\n" - " int *b = new(p) int;\n" - " int *c = new(p) int(3);\n" - " delete(b);\n" + " A(int *p) : a(new (p) int) {\n" + " new (p) int;\n" + " int *b = new (p) int;\n" + " int *c = new (p) int(3);\n" + " delete (b);\n" " }\n" "};", AfterPlacementOperator); verifyFormat("void operator new(void *foo) ATTRIB;", AfterPlacementOperator); AfterPlacementOperator.SpaceBeforeParensOptions.AfterPlacementOperator = - FormatStyle::SpaceBeforeParensCustom::APO_Always; + false; + verifyFormat("new(buf) int;", AfterPlacementOperator); verifyFormat("struct A {\n" " int *a;\n" - " A(int 
*p) : a(new (p) int) {\n" - " new (p) int;\n" - " int *b = new (p) int;\n" - " int *c = new (p) int(3);\n" - " delete (b);\n" + " A(int *p) : a(new(p) int) {\n" + " new(p) int;\n" + " int *b = new(p) int;\n" + " int *c = new(p) int(3);\n" + " delete(b);\n" " }\n" "};", AfterPlacementOperator); @@ -26860,6 +26863,7 @@ TEST_F(FormatTest, RemoveParentheses) { EXPECT_EQ(Style.RemoveParentheses, FormatStyle::RPS_Leave); Style.RemoveParentheses = FormatStyle::RPS_MultipleParentheses; + verifyFormat("#define Foo(...) foo((__VA_ARGS__))", Style); verifyFormat("int x __attribute__((aligned(16))) = 0;", Style); verifyFormat("decltype((foo->bar)) baz;", Style); verifyFormat("class __declspec(dllimport) X {};", @@ -26894,6 +26898,7 @@ TEST_F(FormatTest, RemoveParentheses) { verifyFormat("return (({ 0; }));", "return ((({ 0; })));", Style); Style.RemoveParentheses = FormatStyle::RPS_ReturnStatement; + verifyFormat("#define Return0 return (0);", Style); verifyFormat("return 0;", "return (0);", Style); verifyFormat("co_return 0;", "co_return ((0));", Style); verifyFormat("return 0;", "return (((0)));", Style); @@ -27021,10 +27026,16 @@ TEST_F(FormatTest, PPBranchesInBracedInit) { "};"); } -TEST_F(FormatTest, StreamOutputOperator) { - verifyFormat("std::cout << \"foo\" << \"bar\" << baz;"); - verifyFormat("std::cout << \"foo\\n\"\n" - " << \"bar\";"); +TEST_F(FormatTest, PPDirectivesAndCommentsInBracedInit) { + verifyFormat("{\n" + " char *a[] = {\n" + " /* abc */ \"abc\",\n" + "#if FOO\n" + " /* xyz */ \"xyz\",\n" + "#endif\n" + " /* last */ \"last\"};\n" + "}", + getLLVMStyleWithColumns(30)); } TEST_F(FormatTest, BreakAdjacentStringLiterals) { @@ -27041,6 +27052,7 @@ TEST_F(FormatTest, BreakAdjacentStringLiterals) { Style.BreakAdjacentStringLiterals = false; verifyFormat(Code, Style); } + } // namespace } // namespace test } // namespace format diff --git a/clang/unittests/Format/FormatTestComments.cpp b/clang/unittests/Format/FormatTestComments.cpp index 
c249f4d9333fd0..d7c432ed031d34 100644 --- a/clang/unittests/Format/FormatTestComments.cpp +++ b/clang/unittests/Format/FormatTestComments.cpp @@ -376,6 +376,10 @@ TEST_F(FormatTestComments, RemovesTrailingWhitespaceOfComments) { TEST_F(FormatTestComments, UnderstandsBlockComments) { verifyFormat("f(/*noSpaceAfterParameterNamingComment=*/true);"); verifyFormat("void f() { g(/*aaa=*/x, /*bbb=*/!y, /*c=*/::c); }"); + verifyFormat("fooooooooooooooooooooooooooooo(\n" + " /*qq_=*/move(q), [this, b](bar b) {},\n" + " c);", + getLLVMStyleWithColumns(60)); EXPECT_EQ("f(aaaaaaaaaaaaaaaaaaaaaaaaa, /* Trailing comment for aa... */\n" " bbbbbbbbbbbbbbbbbbbbbbbbb);", format("f(aaaaaaaaaaaaaaaaaaaaaaaaa , \\\n" diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 3dbf504c35ed55..44ebad9d5a872a 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -611,6 +611,11 @@ TEST_F(TokenAnnotatorTest, UnderstandsCasts) { EXPECT_TOKEN(Tokens[13], tok::r_paren, TT_Unknown); EXPECT_TOKEN(Tokens[14], tok::star, TT_BinaryOperator); + Tokens = annotate("#define foo(i) ((i) - bar)"); + ASSERT_EQ(Tokens.size(), 14u) << Tokens; + EXPECT_TOKEN(Tokens[9], tok::r_paren, TT_Unknown); + EXPECT_TOKEN(Tokens[10], tok::minus, TT_BinaryOperator); + Tokens = annotate("return (Foo) & 10;"); ASSERT_EQ(Tokens.size(), 8u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::r_paren, TT_Unknown); @@ -1867,6 +1872,10 @@ TEST_F(TokenAnnotatorTest, UnderstandsTrailingReturnArrow) { ASSERT_EQ(Tokens.size(), 12u) << Tokens; EXPECT_TOKEN(Tokens[7], tok::arrow, TT_Unknown); + Tokens = annotate("__attribute__((cold)) C() : Base(obj->func()) {}"); + ASSERT_EQ(Tokens.size(), 21u) << Tokens; + EXPECT_TOKEN(Tokens[13], tok::arrow, TT_Unknown); + // Mixed Tokens = annotate("auto f() -> int { auto a = b()->c; }"); ASSERT_EQ(Tokens.size(), 18u) << Tokens; @@ -1880,14 +1889,20 @@ TEST_F(TokenAnnotatorTest, 
UnderstandHashInMacro) { " #Bar \\\n" " }"); ASSERT_EQ(Tokens.size(), 11u) << Tokens; - EXPECT_BRACE_KIND(Tokens[6], BK_Block); - EXPECT_BRACE_KIND(Tokens[9], BK_Block); + EXPECT_BRACE_KIND(Tokens[6], BK_BracedInit); + EXPECT_BRACE_KIND(Tokens[9], BK_BracedInit); Tokens = annotate("#define Foo(Bar) \\\n" " { #Bar }"); ASSERT_EQ(Tokens.size(), 11u) << Tokens; - EXPECT_BRACE_KIND(Tokens[6], BK_Block); - EXPECT_BRACE_KIND(Tokens[9], BK_Block); + EXPECT_BRACE_KIND(Tokens[6], BK_BracedInit); + EXPECT_BRACE_KIND(Tokens[9], BK_BracedInit); + + Tokens = annotate("#define FOO(typeName, realClass) \\\n" + " {#typeName, foo(new foo(#typeName))}"); + ASSERT_EQ(Tokens.size(), 29u) << Tokens; + EXPECT_BRACE_KIND(Tokens[8], BK_BracedInit); + EXPECT_BRACE_KIND(Tokens[27], BK_BracedInit); } TEST_F(TokenAnnotatorTest, UnderstandsAttributeMacros) { @@ -2590,15 +2605,33 @@ TEST_F(TokenAnnotatorTest, BraceKind) { EXPECT_TOKEN(Tokens[4], tok::l_brace, TT_FunctionLBrace); EXPECT_BRACE_KIND(Tokens[4], BK_Block); EXPECT_BRACE_KIND(Tokens[6], BK_Block); -} -TEST_F(TokenAnnotatorTest, StreamOperator) { - auto Tokens = annotate("\"foo\\n\" << aux << \"foo\\n\" << \"foo\";"); - ASSERT_EQ(Tokens.size(), 9u) << Tokens; - EXPECT_FALSE(Tokens[1]->MustBreakBefore); - EXPECT_FALSE(Tokens[3]->MustBreakBefore); - // Only break between string literals if the former ends with \n. 
- EXPECT_TRUE(Tokens[5]->MustBreakBefore); + Tokens = annotate("struct Foo {\n" + " Foo() {};\n" + " ~Foo() {};\n" + "};"); + ASSERT_EQ(Tokens.size(), 19u) << Tokens; + EXPECT_TOKEN(Tokens[3], tok::identifier, TT_CtorDtorDeclName); + EXPECT_TOKEN(Tokens[6], tok::l_brace, TT_FunctionLBrace); + EXPECT_BRACE_KIND(Tokens[6], BK_Block); + EXPECT_BRACE_KIND(Tokens[7], BK_Block); + EXPECT_TOKEN(Tokens[10], tok::identifier, TT_CtorDtorDeclName); + EXPECT_TOKEN(Tokens[13], tok::l_brace, TT_FunctionLBrace); + EXPECT_BRACE_KIND(Tokens[13], BK_Block); + EXPECT_BRACE_KIND(Tokens[14], BK_Block); + + Tokens = annotate("{\n" + " char *a[] = {\n" + " /* abc */ \"abc\",\n" + "#if FOO\n" + " /* xyz */ \"xyz\",\n" + "#endif\n" + " /* last */ \"last\"};\n" + "}"); + ASSERT_EQ(Tokens.size(), 25u) << Tokens; + EXPECT_BRACE_KIND(Tokens[0], BK_Block); + EXPECT_BRACE_KIND(Tokens[7], BK_BracedInit); + EXPECT_BRACE_KIND(Tokens[21], BK_BracedInit); } } // namespace diff --git a/clang/unittests/Serialization/CMakeLists.txt b/clang/unittests/Serialization/CMakeLists.txt index 10d7de970c643d..e7eebd0cb98239 100644 --- a/clang/unittests/Serialization/CMakeLists.txt +++ b/clang/unittests/Serialization/CMakeLists.txt @@ -10,6 +10,7 @@ add_clang_unittest(SerializationTests InMemoryModuleCacheTest.cpp ModuleCacheTest.cpp NoCommentsTest.cpp + PreambleInNamedModulesTest.cpp SourceLocationEncodingTest.cpp VarDeclConstantInitTest.cpp ) diff --git a/clang/unittests/Serialization/PreambleInNamedModulesTest.cpp b/clang/unittests/Serialization/PreambleInNamedModulesTest.cpp new file mode 100644 index 00000000000000..d26e1cb633654f --- /dev/null +++ b/clang/unittests/Serialization/PreambleInNamedModulesTest.cpp @@ -0,0 +1,132 @@ +//===- unittests/Serialization/PreambleInNamedModulesTest.cpp -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Frontend/CompilerInstance.h" +#include "clang/Frontend/CompilerInvocation.h" +#include "clang/Frontend/FrontendActions.h" +#include "clang/Frontend/PrecompiledPreamble.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" + +#include "gtest/gtest.h" + +using namespace llvm; +using namespace clang; + +namespace { + +class PreambleInNamedModulesTest : public ::testing::Test { + void SetUp() override { + ASSERT_FALSE(sys::fs::createUniqueDirectory("modules-test", TestDir)); + } + + void TearDown() override { sys::fs::remove_directories(TestDir); } + +public: + using PathType = SmallString<256>; + + PathType TestDir; + + void addFile(StringRef Path, StringRef Contents, PathType &AbsPath) { + ASSERT_FALSE(sys::path::is_absolute(Path)); + + AbsPath = TestDir; + sys::path::append(AbsPath, Path); + + ASSERT_FALSE( + sys::fs::create_directories(llvm::sys::path::parent_path(AbsPath))); + + std::error_code EC; + llvm::raw_fd_ostream OS(AbsPath, EC); + ASSERT_FALSE(EC); + OS << Contents; + } + + void addFile(StringRef Path, StringRef Contents) { + PathType UnusedAbsPath; + addFile(Path, Contents, UnusedAbsPath); + } +}; + +// Testing that the use of Preamble in named modules can work basically. 
+// See https://github.com/llvm/llvm-project/issues/80570 +TEST_F(PreambleInNamedModulesTest, BasicTest) { + addFile("foo.h", R"cpp( +enum class E { + A, + B, + C, + D +}; + )cpp"); + + PathType MainFilePath; + addFile("A.cppm", R"cpp( +module; +#include "foo.h" +export module A; +export using ::E; + )cpp", + MainFilePath); + + IntrusiveRefCntPtr Diags = + CompilerInstance::createDiagnostics(new DiagnosticOptions()); + IntrusiveRefCntPtr VFS = + llvm::vfs::createPhysicalFileSystem(); + + CreateInvocationOptions CIOpts; + CIOpts.Diags = Diags; + CIOpts.VFS = VFS; + + const char *Args[] = {"clang++", "-std=c++20", "-working-directory", + TestDir.c_str(), MainFilePath.c_str()}; + std::shared_ptr Invocation = + createInvocation(Args, CIOpts); + ASSERT_TRUE(Invocation); + + llvm::ErrorOr> ContentsBuffer = + llvm::MemoryBuffer::getFile(MainFilePath, /*IsText=*/true); + EXPECT_TRUE(ContentsBuffer); + std::unique_ptr Buffer = std::move(*ContentsBuffer); + + PreambleBounds Bounds = + ComputePreambleBounds(Invocation->getLangOpts(), *Buffer, 0); + + PreambleCallbacks Callbacks; + llvm::ErrorOr BuiltPreamble = PrecompiledPreamble::Build( + *Invocation, Buffer.get(), Bounds, *Diags, VFS, + std::make_shared(), + /*StoreInMemory=*/false, /*StoragePath=*/TestDir, Callbacks); + + ASSERT_FALSE(Diags->hasErrorOccurred()); + + EXPECT_TRUE(BuiltPreamble); + EXPECT_TRUE(BuiltPreamble->CanReuse(*Invocation, *Buffer, Bounds, *VFS)); + BuiltPreamble->OverridePreamble(*Invocation, VFS, Buffer.get()); + + auto Clang = std::make_unique( + std::make_shared()); + Clang->setInvocation(std::move(Invocation)); + Clang->setDiagnostics(Diags.get()); + + if (auto VFSWithRemapping = createVFSFromCompilerInvocation( + Clang->getInvocation(), Clang->getDiagnostics(), VFS)) + VFS = VFSWithRemapping; + + Clang->createFileManager(VFS); + EXPECT_TRUE(Clang->createTarget()); + + Buffer.release(); + + SyntaxOnlyAction Action; + EXPECT_TRUE(Clang->ExecuteAction(Action)); + 
EXPECT_FALSE(Clang->getDiagnosticsPtr()->hasErrorOccurred()); +} + +} // namespace diff --git a/clang/utils/perf-training/CMakeLists.txt b/clang/utils/perf-training/CMakeLists.txt index c6d51863fb1b5c..93744f46060236 100644 --- a/clang/utils/perf-training/CMakeLists.txt +++ b/clang/utils/perf-training/CMakeLists.txt @@ -1,6 +1,10 @@ +include(LLVMExternalProjectUtils) + set(CLANG_PGO_TRAINING_DATA "${CMAKE_CURRENT_SOURCE_DIR}" CACHE PATH "The path to a lit testsuite containing samples for PGO and order file generation" ) +set(CLANG_PGO_TRAINING_DATA_SOURCE_DIR OFF CACHE STRING "Path to source directory containing cmake project with source files to use for generating pgo data") +set(CLANG_PGO_TRAINING_DEPS "" CACHE STRING "Extra dependencies needed to build the PGO training data.") if(LLVM_BUILD_INSTRUMENTED) configure_lit_site_cfg( @@ -11,11 +15,11 @@ if(LLVM_BUILD_INSTRUMENTED) add_lit_testsuite(generate-profraw "Generating clang PGO data" ${CMAKE_CURRENT_BINARY_DIR}/pgo-data/ EXCLUDE_FROM_CHECK_ALL - DEPENDS clang clear-profraw ${CLANG_PERF_TRAINING_DEPS} + DEPENDS clang clear-profraw ${CLANG_PGO_TRAINING_DEPS} ) add_custom_target(clear-profraw - COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} profraw + COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_BINARY_DIR}/profiles/ profraw COMMENT "Clearing old profraw data") if(NOT LLVM_PROFDATA) @@ -26,9 +30,14 @@ if(LLVM_BUILD_INSTRUMENTED) message(STATUS "To enable merging PGO data LLVM_PROFDATA has to point to llvm-profdata") else() add_custom_target(generate-profdata - COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py merge ${LLVM_PROFDATA} ${CMAKE_CURRENT_BINARY_DIR}/clang.profdata ${CMAKE_CURRENT_BINARY_DIR} + COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py merge ${LLVM_PROFDATA} ${CMAKE_CURRENT_BINARY_DIR}/clang.profdata 
${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_BINARY_DIR}/profiles/ COMMENT "Merging profdata" DEPENDS generate-profraw) + if (CLANG_PGO_TRAINING_DATA_SOURCE_DIR) + llvm_ExternalProject_Add(generate-profraw-external ${CLANG_PGO_TRAINING_DATA_SOURCE_DIR} + USE_TOOLCHAIN EXLUDE_FROM_ALL NO_INSTALL DEPENDS generate-profraw) + add_dependencies(generate-profdata generate-profraw-external) + endif() endif() endif() diff --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py index 99d6a3333b6ef0..3e92cd38a71451 100644 --- a/clang/utils/perf-training/perf-helper.py +++ b/clang/utils/perf-training/perf-helper.py @@ -30,26 +30,28 @@ def findFilesWithExtension(path, extension): def clean(args): - if len(args) != 2: + if len(args) < 2: print( - "Usage: %s clean \n" % __file__ + "Usage: %s clean \n" % __file__ + "\tRemoves all files with extension from ." ) return 1 - for filename in findFilesWithExtension(args[0], args[1]): - os.remove(filename) + for path in args[1:-1]: + for filename in findFilesWithExtension(path, args[-1]): + os.remove(filename) return 0 def merge(args): - if len(args) != 3: + if len(args) < 3: print( - "Usage: %s merge \n" % __file__ + "Usage: %s merge \n" % __file__ + "\tMerges all profraw files from path into output." ) return 1 cmd = [args[0], "merge", "-o", args[1]] - cmd.extend(findFilesWithExtension(args[2], "profraw")) + for path in args[2:]: + cmd.extend(findFilesWithExtension(path, "profraw")) subprocess.check_call(cmd) return 0 diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index f0d835fc091cea..c9d21f096a34c6 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -12630,7 +12630,7 @@

C++ defect report implementation status

2137 CD4 List-initialization from object of same type - Clang 18 + Unknown 2138 @@ -13674,7 +13674,7 @@

C++ defect report implementation status

2311 open Missed case for guaranteed copy elision - Clang 18 + Not resolved 2312 diff --git a/compiler-rt/lib/builtins/divtc3.c b/compiler-rt/lib/builtins/divtc3.c index e970cef574b21d..099de5802daf0e 100644 --- a/compiler-rt/lib/builtins/divtc3.c +++ b/compiler-rt/lib/builtins/divtc3.c @@ -13,7 +13,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_TF_MODE) +#if defined(CRT_HAS_F128) // Returns: the quotient of (a + ib) / (c + id) diff --git a/compiler-rt/lib/builtins/fp_lib.h b/compiler-rt/lib/builtins/fp_lib.h index af406e760497a4..c4f0a5b9587f77 100644 --- a/compiler-rt/lib/builtins/fp_lib.h +++ b/compiler-rt/lib/builtins/fp_lib.h @@ -22,6 +22,7 @@ #include "int_lib.h" #include "int_math.h" +#include "int_types.h" #include #include #include @@ -93,13 +94,14 @@ static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) { COMPILER_RT_ABI fp_t __adddf3(fp_t a, fp_t b); #elif defined QUAD_PRECISION -#if defined(CRT_HAS_TF_MODE) +#if defined(CRT_HAS_F128) && defined(CRT_HAS_128BIT) typedef uint64_t half_rep_t; typedef __uint128_t rep_t; typedef __int128_t srep_t; typedef tf_float fp_t; #define HALF_REP_C UINT64_C #define REP_C (__uint128_t) +#if defined(CRT_HAS_IEEE_TF) // Note: Since there is no explicit way to tell compiler the constant is a // 128-bit integer, we let the constant be casted to 128-bit integer #define significandBits 112 @@ -188,7 +190,10 @@ static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) { #undef Word_HiMask #undef Word_LoMask #undef Word_FullMask -#endif // defined(CRT_HAS_TF_MODE) +#endif // defined(CRT_HAS_IEEE_TF) +#else +typedef long double fp_t; +#endif // defined(CRT_HAS_F128) && defined(CRT_HAS_128BIT) #else #error SINGLE_PRECISION, DOUBLE_PRECISION or QUAD_PRECISION must be defined. 
#endif @@ -196,19 +201,6 @@ static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) { #if defined(SINGLE_PRECISION) || defined(DOUBLE_PRECISION) || \ (defined(QUAD_PRECISION) && defined(CRT_HAS_TF_MODE)) #define typeWidth (sizeof(rep_t) * CHAR_BIT) -#define exponentBits (typeWidth - significandBits - 1) -#define maxExponent ((1 << exponentBits) - 1) -#define exponentBias (maxExponent >> 1) - -#define implicitBit (REP_C(1) << significandBits) -#define significandMask (implicitBit - 1U) -#define signBit (REP_C(1) << (significandBits + exponentBits)) -#define absMask (signBit - 1U) -#define exponentMask (absMask ^ significandMask) -#define oneRep ((rep_t)exponentBias << significandBits) -#define infRep exponentMask -#define quietBit (implicitBit >> 1) -#define qnanRep (exponentMask | quietBit) static __inline rep_t toRep(fp_t x) { const union { @@ -226,6 +218,21 @@ static __inline fp_t fromRep(rep_t x) { return rep.f; } +#if !defined(QUAD_PRECISION) || defined(CRT_HAS_IEEE_TF) +#define exponentBits (typeWidth - significandBits - 1) +#define maxExponent ((1 << exponentBits) - 1) +#define exponentBias (maxExponent >> 1) + +#define implicitBit (REP_C(1) << significandBits) +#define significandMask (implicitBit - 1U) +#define signBit (REP_C(1) << (significandBits + exponentBits)) +#define absMask (signBit - 1U) +#define exponentMask (absMask ^ significandMask) +#define oneRep ((rep_t)exponentBias << significandBits) +#define infRep exponentMask +#define quietBit (implicitBit >> 1) +#define qnanRep (exponentMask | quietBit) + static __inline int normalize(rep_t *significand) { const int shift = rep_clz(*significand) - rep_clz(implicitBit); *significand <<= shift; @@ -328,6 +335,8 @@ static __inline fp_t __compiler_rt_scalbnX(fp_t x, int y) { return fromRep(sign | ((rep_t)exp << significandBits) | sig); } +#endif // !defined(QUAD_PRECISION) || defined(CRT_HAS_IEEE_TF) + // Avoid using fmax from libm. 
static __inline fp_t __compiler_rt_fmaxX(fp_t x, fp_t y) { // If either argument is NaN, return the other argument. If both are NaN, @@ -405,6 +414,8 @@ static __inline tf_float __compiler_rt_fmaxtf(tf_float x, tf_float y) { #define __compiler_rt_logbl crt_logbl #define __compiler_rt_scalbnl crt_scalbnl #define __compiler_rt_fmaxl crt_fmaxl +#define crt_fabstf crt_fabsl +#define crt_copysigntf crt_copysignl #else #error Unsupported TF mode type #endif diff --git a/compiler-rt/lib/builtins/i386/chkstk.S b/compiler-rt/lib/builtins/i386/chkstk.S index a84bb0ee300705..cdd9a4c2a57522 100644 --- a/compiler-rt/lib/builtins/i386/chkstk.S +++ b/compiler-rt/lib/builtins/i386/chkstk.S @@ -14,7 +14,6 @@ .text .balign 4 DEFINE_COMPILERRT_FUNCTION(_alloca) // _chkstk and _alloca are the same function -DEFINE_COMPILERRT_FUNCTION(_chkstk) push %ecx cmp $0x1000,%eax lea 8(%esp),%ecx // esp before calling this routine -> ecx @@ -35,7 +34,6 @@ DEFINE_COMPILERRT_FUNCTION(_chkstk) push (%eax) // push return address onto the stack sub %esp,%eax // restore the original value in eax ret -END_COMPILERRT_FUNCTION(_chkstk) END_COMPILERRT_FUNCTION(_alloca) #endif // __i386__ diff --git a/compiler-rt/lib/builtins/int_types.h b/compiler-rt/lib/builtins/int_types.h index 7624c728061518..ca97391fc28466 100644 --- a/compiler-rt/lib/builtins/int_types.h +++ b/compiler-rt/lib/builtins/int_types.h @@ -189,12 +189,16 @@ typedef long double tf_float; #define CRT_LDBL_IEEE_F128 #endif #define TF_C(x) x##L -#elif __LDBL_MANT_DIG__ == 113 -// Use long double instead of __float128 if it matches the IEEE 128-bit format. +#elif __LDBL_MANT_DIG__ == 113 || \ + (__FLT_RADIX__ == 16 && __LDBL_MANT_DIG__ == 28) +// Use long double instead of __float128 if it matches the IEEE 128-bit format +// or the IBM hexadecimal format. 
#define CRT_LDBL_128BIT #define CRT_HAS_F128 +#if __LDBL_MANT_DIG__ == 113 #define CRT_HAS_IEEE_TF #define CRT_LDBL_IEEE_F128 +#endif typedef long double tf_float; #define TF_C(x) x##L #elif defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__) diff --git a/compiler-rt/lib/builtins/multc3.c b/compiler-rt/lib/builtins/multc3.c index f20e53ccbf233b..61a3f45e47279c 100644 --- a/compiler-rt/lib/builtins/multc3.c +++ b/compiler-rt/lib/builtins/multc3.c @@ -15,7 +15,7 @@ #include "int_lib.h" #include "int_math.h" -#if defined(CRT_HAS_TF_MODE) +#if defined(CRT_HAS_F128) // Returns: the product of a + ib and c + id diff --git a/compiler-rt/lib/builtins/riscv/restore.S b/compiler-rt/lib/builtins/riscv/restore.S index 73f64a920d6698..6f43842c8ca684 100644 --- a/compiler-rt/lib/builtins/riscv/restore.S +++ b/compiler-rt/lib/builtins/riscv/restore.S @@ -22,6 +22,8 @@ #if __riscv_xlen == 32 +#ifndef __riscv_32e + .globl __riscv_restore_12 .type __riscv_restore_12,@function __riscv_restore_12: @@ -86,8 +88,29 @@ __riscv_restore_0: addi sp, sp, 16 ret +#else + + .globl __riscv_restore_2 + .type __riscv_restore_2,@function + .globl __riscv_restore_1 + .type __riscv_restore_1,@function + .globl __riscv_restore_0 + .type __riscv_restore_0,@function +__riscv_restore_2: +__riscv_restore_1: +__riscv_restore_0: + lw s1, 0(sp) + lw s0, 4(sp) + lw ra, 8(sp) + addi sp, sp, 12 + ret + +#endif + #elif __riscv_xlen == 64 +#ifndef __riscv_64e + .globl __riscv_restore_12 .type __riscv_restore_12,@function __riscv_restore_12: @@ -161,6 +184,25 @@ __riscv_restore_0: addi sp, sp, 16 ret +#else + + .globl __riscv_restore_2 + .type __riscv_restore_2,@function + .globl __riscv_restore_1 + .type __riscv_restore_1,@function + .globl __riscv_restore_0 + .type __riscv_restore_0,@function +__riscv_restore_2: +__riscv_restore_1: +__riscv_restore_0: + ld s1, 0(sp) + ld s0, 8(sp) + ld ra, 16(sp) + addi sp, sp, 24 + ret + +#endif + #else # error "xlen must be 32 or 64 for save-restore implementation #endif 
diff --git a/compiler-rt/lib/builtins/riscv/save.S b/compiler-rt/lib/builtins/riscv/save.S index 85501aeb4c2e93..3e044179ff7f1d 100644 --- a/compiler-rt/lib/builtins/riscv/save.S +++ b/compiler-rt/lib/builtins/riscv/save.S @@ -18,6 +18,8 @@ #if __riscv_xlen == 32 +#ifndef __riscv_32e + .globl __riscv_save_12 .type __riscv_save_12,@function __riscv_save_12: @@ -92,8 +94,29 @@ __riscv_save_0: sw ra, 12(sp) jr t0 +#else + + .globl __riscv_save_2 + .type __riscv_save_2,@function + .globl __riscv_save_1 + .type __riscv_save_1,@function + .globl __riscv_save_0 + .type __riscv_save_0,@function +__riscv_save_2: +__riscv_save_1: +__riscv_save_0: + addi sp, sp, -12 + sw s1, 0(sp) + sw s0, 4(sp) + sw ra, 8(sp) + jr t0 + +#endif + #elif __riscv_xlen == 64 +#ifndef __riscv_64e + .globl __riscv_save_12 .type __riscv_save_12,@function __riscv_save_12: @@ -181,6 +204,25 @@ __riscv_save_0: sd ra, 8(sp) jr t0 +#else + + .globl __riscv_save_2 + .type __riscv_save_2,@function + .globl __riscv_save_1 + .type __riscv_save_1,@function + .globl __riscv_save_0 + .type __riscv_save_0,@function +__riscv_save_2: +__riscv_save_1: +__riscv_save_0: + addi sp, sp, -24 + sd s1, 0(sp) + sd s0, 8(sp) + sd ra, 16(sp) + jr t0 + +#endif + #else # error "xlen must be 32 or 64 for save-restore implementation #endif diff --git a/compiler-rt/lib/builtins/x86_64/chkstk.S b/compiler-rt/lib/builtins/x86_64/chkstk.S index 494ee261193bc7..ad7953a116ac7e 100644 --- a/compiler-rt/lib/builtins/x86_64/chkstk.S +++ b/compiler-rt/lib/builtins/x86_64/chkstk.S @@ -18,7 +18,6 @@ .text .balign 4 DEFINE_COMPILERRT_FUNCTION(___chkstk_ms) -DEFINE_COMPILERRT_FUNCTION(__chkstk) push %rcx push %rax cmp $0x1000,%rax @@ -36,7 +35,6 @@ DEFINE_COMPILERRT_FUNCTION(__chkstk) pop %rax pop %rcx ret -END_COMPILERRT_FUNCTION(__chkstk) END_COMPILERRT_FUNCTION(___chkstk_ms) #endif // __x86_64__ diff --git a/compiler-rt/lib/dfsan/dfsan_custom.cpp b/compiler-rt/lib/dfsan/dfsan_custom.cpp index 85b796bd6349c8..3af26e9f64c925 100644 --- 
a/compiler-rt/lib/dfsan/dfsan_custom.cpp +++ b/compiler-rt/lib/dfsan/dfsan_custom.cpp @@ -55,6 +55,10 @@ using namespace __dfsan; #define DECLARE_WEAK_INTERCEPTOR_HOOK(f, ...) \ SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void f(__VA_ARGS__); +#define WRAPPER_ALIAS(fun, real) \ + SANITIZER_INTERFACE_ATTRIBUTE void __dfsw_##fun() ALIAS(__dfsw_##real); \ + SANITIZER_INTERFACE_ATTRIBUTE void __dfso_##fun() ALIAS(__dfso_##real); + // Async-safe, non-reentrant spin lock. class SignalSpinLocker { public: @@ -1197,16 +1201,20 @@ char *__dfso_strcpy(char *dest, const char *src, dfsan_label dst_label, *ret_origin = dst_origin; return ret; } +} -static long int dfsan_strtol(const char *nptr, char **endptr, int base, - char **tmp_endptr) { +template +static ALWAYS_INLINE auto dfsan_strtol_impl( + Fn real, const char *nptr, char **endptr, int base, + char **tmp_endptr) -> decltype(real(nullptr, nullptr, 0)) { assert(tmp_endptr); - long int ret = strtol(nptr, tmp_endptr, base); + auto ret = real(nptr, tmp_endptr, base); if (endptr) *endptr = *tmp_endptr; return ret; } +extern "C" { static void dfsan_strtolong_label(const char *nptr, const char *tmp_endptr, dfsan_label base_label, dfsan_label *ret_label) { @@ -1236,30 +1244,6 @@ static void dfsan_strtolong_origin(const char *nptr, const char *tmp_endptr, } } -SANITIZER_INTERFACE_ATTRIBUTE -long int __dfsw_strtol(const char *nptr, char **endptr, int base, - dfsan_label nptr_label, dfsan_label endptr_label, - dfsan_label base_label, dfsan_label *ret_label) { - char *tmp_endptr; - long int ret = dfsan_strtol(nptr, endptr, base, &tmp_endptr); - dfsan_strtolong_label(nptr, tmp_endptr, base_label, ret_label); - return ret; -} - -SANITIZER_INTERFACE_ATTRIBUTE -long int __dfso_strtol(const char *nptr, char **endptr, int base, - dfsan_label nptr_label, dfsan_label endptr_label, - dfsan_label base_label, dfsan_label *ret_label, - dfsan_origin nptr_origin, dfsan_origin endptr_origin, - dfsan_origin base_origin, dfsan_origin 
*ret_origin) { - char *tmp_endptr; - long int ret = dfsan_strtol(nptr, endptr, base, &tmp_endptr); - dfsan_strtolong_label(nptr, tmp_endptr, base_label, ret_label); - dfsan_strtolong_origin(nptr, tmp_endptr, base_label, ret_label, base_origin, - ret_origin); - return ret; -} - static double dfsan_strtod(const char *nptr, char **endptr, char **tmp_endptr) { assert(tmp_endptr); double ret = strtod(nptr, tmp_endptr); @@ -1307,108 +1291,40 @@ double __dfso_strtod(const char *nptr, char **endptr, dfsan_label nptr_label, return ret; } -static long long int dfsan_strtoll(const char *nptr, char **endptr, int base, - char **tmp_endptr) { - assert(tmp_endptr); - long long int ret = strtoll(nptr, tmp_endptr, base); - if (endptr) - *endptr = *tmp_endptr; - return ret; -} - -SANITIZER_INTERFACE_ATTRIBUTE -long long int __dfsw_strtoll(const char *nptr, char **endptr, int base, - dfsan_label nptr_label, dfsan_label endptr_label, - dfsan_label base_label, dfsan_label *ret_label) { - char *tmp_endptr; - long long int ret = dfsan_strtoll(nptr, endptr, base, &tmp_endptr); - dfsan_strtolong_label(nptr, tmp_endptr, base_label, ret_label); - return ret; -} - -SANITIZER_INTERFACE_ATTRIBUTE -long long int __dfso_strtoll(const char *nptr, char **endptr, int base, - dfsan_label nptr_label, dfsan_label endptr_label, - dfsan_label base_label, dfsan_label *ret_label, - dfsan_origin nptr_origin, - dfsan_origin endptr_origin, - dfsan_origin base_origin, - dfsan_origin *ret_origin) { - char *tmp_endptr; - long long int ret = dfsan_strtoll(nptr, endptr, base, &tmp_endptr); - dfsan_strtolong_label(nptr, tmp_endptr, base_label, ret_label); - dfsan_strtolong_origin(nptr, tmp_endptr, base_label, ret_label, base_origin, - ret_origin); - return ret; -} - -static unsigned long int dfsan_strtoul(const char *nptr, char **endptr, - int base, char **tmp_endptr) { - assert(tmp_endptr); - unsigned long int ret = strtoul(nptr, tmp_endptr, base); - if (endptr) - *endptr = *tmp_endptr; - return ret; -} - 
-SANITIZER_INTERFACE_ATTRIBUTE -unsigned long int __dfsw_strtoul(const char *nptr, char **endptr, int base, - dfsan_label nptr_label, dfsan_label endptr_label, - dfsan_label base_label, dfsan_label *ret_label) { - char *tmp_endptr; - unsigned long int ret = dfsan_strtoul(nptr, endptr, base, &tmp_endptr); - dfsan_strtolong_label(nptr, tmp_endptr, base_label, ret_label); - return ret; -} - -SANITIZER_INTERFACE_ATTRIBUTE -unsigned long int __dfso_strtoul( - const char *nptr, char **endptr, int base, dfsan_label nptr_label, - dfsan_label endptr_label, dfsan_label base_label, dfsan_label *ret_label, - dfsan_origin nptr_origin, dfsan_origin endptr_origin, - dfsan_origin base_origin, dfsan_origin *ret_origin) { - char *tmp_endptr; - unsigned long int ret = dfsan_strtoul(nptr, endptr, base, &tmp_endptr); - dfsan_strtolong_label(nptr, tmp_endptr, base_label, ret_label); - dfsan_strtolong_origin(nptr, tmp_endptr, base_label, ret_label, base_origin, - ret_origin); - return ret; -} - -static long long unsigned int dfsan_strtoull(const char *nptr, char **endptr, - int base, char **tmp_endptr) { - assert(tmp_endptr); - long long unsigned int ret = strtoull(nptr, tmp_endptr, base); - if (endptr) - *endptr = *tmp_endptr; - return ret; -} - -SANITIZER_INTERFACE_ATTRIBUTE -long long unsigned int __dfsw_strtoull(const char *nptr, char **endptr, - int base, dfsan_label nptr_label, - dfsan_label endptr_label, - dfsan_label base_label, - dfsan_label *ret_label) { - char *tmp_endptr; - long long unsigned int ret = dfsan_strtoull(nptr, endptr, base, &tmp_endptr); - dfsan_strtolong_label(nptr, tmp_endptr, base_label, ret_label); - return ret; -} - -SANITIZER_INTERFACE_ATTRIBUTE -long long unsigned int __dfso_strtoull( - const char *nptr, char **endptr, int base, dfsan_label nptr_label, - dfsan_label endptr_label, dfsan_label base_label, dfsan_label *ret_label, - dfsan_origin nptr_origin, dfsan_origin endptr_origin, - dfsan_origin base_origin, dfsan_origin *ret_origin) { - char *tmp_endptr; 
- long long unsigned int ret = dfsan_strtoull(nptr, endptr, base, &tmp_endptr); - dfsan_strtolong_label(nptr, tmp_endptr, base_label, ret_label); - dfsan_strtolong_origin(nptr, tmp_endptr, base_label, ret_label, base_origin, - ret_origin); - return ret; -} +WRAPPER_ALIAS(__isoc23_strtod, strtod) + +#define WRAPPER_STRTO(ret_type, fun) \ + SANITIZER_INTERFACE_ATTRIBUTE ret_type __dfsw_##fun( \ + const char *nptr, char **endptr, int base, dfsan_label nptr_label, \ + dfsan_label endptr_label, dfsan_label base_label, \ + dfsan_label *ret_label) { \ + char *tmp_endptr; \ + auto ret = dfsan_strtol_impl(fun, nptr, endptr, base, &tmp_endptr); \ + dfsan_strtolong_label(nptr, tmp_endptr, base_label, ret_label); \ + return ret; \ + } \ + SANITIZER_INTERFACE_ATTRIBUTE ret_type __dfso_##fun( \ + const char *nptr, char **endptr, int base, dfsan_label nptr_label, \ + dfsan_label endptr_label, dfsan_label base_label, \ + dfsan_label *ret_label, dfsan_origin nptr_origin, \ + dfsan_origin endptr_origin, dfsan_origin base_origin, \ + dfsan_origin *ret_origin) { \ + char *tmp_endptr; \ + auto ret = dfsan_strtol_impl(fun, nptr, endptr, base, &tmp_endptr); \ + dfsan_strtolong_label(nptr, tmp_endptr, base_label, ret_label); \ + dfsan_strtolong_origin(nptr, tmp_endptr, base_label, ret_label, \ + base_origin, ret_origin); \ + return ret; \ + } + +WRAPPER_STRTO(long, strtol) +WRAPPER_STRTO(long long, strtoll) +WRAPPER_STRTO(unsigned long, strtoul) +WRAPPER_STRTO(unsigned long long, strtoull) +WRAPPER_ALIAS(__isoc23_strtol, strtol) +WRAPPER_ALIAS(__isoc23_strtoll, strtoll) +WRAPPER_ALIAS(__isoc23_strtoul, strtoul) +WRAPPER_ALIAS(__isoc23_strtoull, strtoull) SANITIZER_INTERFACE_ATTRIBUTE time_t __dfsw_time(time_t *t, dfsan_label t_label, dfsan_label *ret_label) { @@ -2231,7 +2147,7 @@ SANITIZER_INTERFACE_ATTRIBUTE int __dfso_write( *ret_label = 0; return write(fd, buf, count); } -} // namespace __dfsan +} // namespace __dfsan // Type used to extract a dfsan_label with va_arg() typedef int 
dfsan_label_va; @@ -2866,31 +2782,8 @@ int __dfso_sscanf(char *str, const char *format, dfsan_label str_label, return ret; } -SANITIZER_INTERFACE_ATTRIBUTE -int __dfsw___isoc99_sscanf(char *str, const char *format, dfsan_label str_label, - dfsan_label format_label, dfsan_label *va_labels, - dfsan_label *ret_label, ...) { - va_list ap; - va_start(ap, ret_label); - int ret = scan_buffer(str, ~0ul, format, va_labels, ret_label, nullptr, - nullptr, ap); - va_end(ap); - return ret; -} - -SANITIZER_INTERFACE_ATTRIBUTE -int __dfso___isoc99_sscanf(char *str, const char *format, dfsan_label str_label, - dfsan_label format_label, dfsan_label *va_labels, - dfsan_label *ret_label, dfsan_origin str_origin, - dfsan_origin format_origin, dfsan_origin *va_origins, - dfsan_origin *ret_origin, ...) { - va_list ap; - va_start(ap, ret_origin); - int ret = scan_buffer(str, ~0ul, format, va_labels, ret_label, &str_origin, - ret_origin, ap); - va_end(ap); - return ret; -} +WRAPPER_ALIAS(__isoc99_sscanf, sscanf) +WRAPPER_ALIAS(__isoc23_sscanf, sscanf) static void BeforeFork() { StackDepotLockBeforeFork(); diff --git a/compiler-rt/lib/dfsan/done_abilist.txt b/compiler-rt/lib/dfsan/done_abilist.txt index c582584d77e45f..86a42ee1b4dce8 100644 --- a/compiler-rt/lib/dfsan/done_abilist.txt +++ b/compiler-rt/lib/dfsan/done_abilist.txt @@ -270,6 +270,11 @@ fun:strtoul=custom fun:strtoull=custom fun:strcat=custom fun:strncat=custom +fun:__isoc23_strtod=custom +fun:__isoc23_strtol=custom +fun:__isoc23_strtoll=custom +fun:__isoc23_strtoul=custom +fun:__isoc23_strtoull=custom # Functions that produce an output that is computed from the input, but is not # necessarily data dependent. 
@@ -311,6 +316,7 @@ fun:snprintf=custom # scanf-like fun:sscanf=custom fun:__isoc99_sscanf=custom +fun:__isoc23_sscanf=custom # TODO: custom fun:asprintf=discard diff --git a/compiler-rt/lib/dfsan/libc_ubuntu1404_abilist.txt b/compiler-rt/lib/dfsan/libc_ubuntu1404_abilist.txt index 433092e2b27b8c..9ffa56a238185f 100644 --- a/compiler-rt/lib/dfsan/libc_ubuntu1404_abilist.txt +++ b/compiler-rt/lib/dfsan/libc_ubuntu1404_abilist.txt @@ -1,3 +1,8 @@ +fun:__isoc23_sscanf=uninstrumented +fun:__isoc23_strtol=uninstrumented +fun:__isoc23_strtoll=uninstrumented +fun:__isoc23_strtoul=uninstrumented +fun:__isoc23_strtoull=uninstrumented fun:_Exit=uninstrumented fun:_IO_adjust_column=uninstrumented fun:_IO_adjust_wcolumn=uninstrumented diff --git a/compiler-rt/lib/msan/msan.cpp b/compiler-rt/lib/msan/msan.cpp index 3cdf10c149902c..a2fc27de1901b4 100644 --- a/compiler-rt/lib/msan/msan.cpp +++ b/compiler-rt/lib/msan/msan.cpp @@ -467,7 +467,7 @@ void __msan_init() { __msan_clear_on_return(); if (__msan_get_track_origins()) VPrintf(1, "msan_track_origins\n"); - if (!InitShadow(__msan_get_track_origins())) { + if (!InitShadowWithReExec(__msan_get_track_origins())) { Printf("FATAL: MemorySanitizer can not mmap the shadow memory.\n"); Printf("FATAL: Make sure to compile with -fPIE and to link with -pie.\n"); Printf("FATAL: Disabling ASLR is known to cause this error.\n"); diff --git a/compiler-rt/lib/msan/msan.h b/compiler-rt/lib/msan/msan.h index 710447a3e1a357..7fb58be67a02cd 100644 --- a/compiler-rt/lib/msan/msan.h +++ b/compiler-rt/lib/msan/msan.h @@ -33,12 +33,18 @@ struct MappingDesc { uptr start; uptr end; enum Type { - INVALID, APP, SHADOW, ORIGIN + INVALID = 1, + ALLOCATOR = 2, + APP = 4, + SHADOW = 8, + ORIGIN = 16, } type; const char *name; }; - +// Note: MappingDesc::ALLOCATOR entries are only used to check for memory +// layout compatibility. The actual allocation settings are in +// msan_allocator.cpp, which need to be kept in sync. 
#if SANITIZER_LINUX && defined(__mips64) // MIPS64 maps: @@ -84,7 +90,8 @@ const MappingDesc kMemoryLayout[] = { {0X0B00000000000, 0X0C00000000000, MappingDesc::SHADOW, "shadow-10-13"}, {0X0C00000000000, 0X0D00000000000, MappingDesc::INVALID, "invalid"}, {0X0D00000000000, 0X0E00000000000, MappingDesc::ORIGIN, "origin-10-13"}, - {0X0E00000000000, 0X1000000000000, MappingDesc::APP, "app-15"}, + {0x0E00000000000, 0x0E40000000000, MappingDesc::ALLOCATOR, "allocator"}, + {0X0E40000000000, 0X1000000000000, MappingDesc::APP, "app-15"}, }; # define MEM_TO_SHADOW(mem) ((uptr)mem ^ 0xB00000000000ULL) # define SHADOW_TO_ORIGIN(shadow) (((uptr)(shadow)) + 0x200000000000ULL) @@ -106,7 +113,8 @@ const MappingDesc kMemoryLayout[] = { {0x510000000000ULL, 0x600000000000ULL, MappingDesc::APP, "app-2"}, {0x600000000000ULL, 0x610000000000ULL, MappingDesc::ORIGIN, "origin-1"}, {0x610000000000ULL, 0x700000000000ULL, MappingDesc::INVALID, "invalid"}, - {0x700000000000ULL, 0x800000000000ULL, MappingDesc::APP, "app-3"}}; + {0x700000000000ULL, 0x740000000000ULL, MappingDesc::ALLOCATOR, "allocator"}, + {0x740000000000ULL, 0x800000000000ULL, MappingDesc::APP, "app-3"}}; # define MEM_TO_SHADOW(mem) (((uptr)(mem)) ^ 0x500000000000ULL) # define SHADOW_TO_ORIGIN(shadow) (((uptr)(shadow)) + 0x100000000000ULL) @@ -118,7 +126,8 @@ const MappingDesc kMemoryLayout[] = { {0x180200000000ULL, 0x1C0000000000ULL, MappingDesc::INVALID, "invalid"}, {0x1C0000000000ULL, 0x2C0200000000ULL, MappingDesc::ORIGIN, "origin"}, {0x2C0200000000ULL, 0x300000000000ULL, MappingDesc::INVALID, "invalid"}, - {0x300000000000ULL, 0x800000000000ULL, MappingDesc::APP, "high memory"}}; + {0x300000000000ULL, 0x320000000000ULL, MappingDesc::ALLOCATOR, "allocator"}, + {0x320000000000ULL, 0x800000000000ULL, MappingDesc::APP, "high memory"}}; // Various kernels use different low end ranges but we can combine them into one // big range. 
They also use different high end ranges but we can map them all to @@ -141,7 +150,8 @@ const MappingDesc kMemoryLayout[] = { {0x180000000000ULL, 0x1C0000000000ULL, MappingDesc::INVALID, "invalid"}, {0x1C0000000000ULL, 0x2C0000000000ULL, MappingDesc::ORIGIN, "origin"}, {0x2C0000000000ULL, 0x440000000000ULL, MappingDesc::INVALID, "invalid"}, - {0x440000000000ULL, 0x500000000000ULL, MappingDesc::APP, "high memory"}}; + {0x440000000000ULL, 0x460000000000ULL, MappingDesc::ALLOCATOR, "allocator"}, + {0x460000000000ULL, 0x500000000000ULL, MappingDesc::APP, "high memory"}}; #define MEM_TO_SHADOW(mem) \ ((((uptr)(mem)) & ~0xC00000000000ULL) + 0x080000000000ULL) @@ -208,7 +218,8 @@ const MappingDesc kMemoryLayout[] = { {0x510000000000ULL, 0x600000000000ULL, MappingDesc::APP, "app-2"}, {0x600000000000ULL, 0x610000000000ULL, MappingDesc::ORIGIN, "origin-1"}, {0x610000000000ULL, 0x700000000000ULL, MappingDesc::INVALID, "invalid"}, - {0x700000000000ULL, 0x800000000000ULL, MappingDesc::APP, "app-3"}}; + {0x700000000000ULL, 0x740000000000ULL, MappingDesc::ALLOCATOR, "allocator"}, + {0x740000000000ULL, 0x800000000000ULL, MappingDesc::APP, "app-3"}}; #define MEM_TO_SHADOW(mem) (((uptr)(mem)) ^ 0x500000000000ULL) #define SHADOW_TO_ORIGIN(mem) (((uptr)(mem)) + 0x100000000000ULL) @@ -223,20 +234,22 @@ const uptr kMemoryLayoutSize = sizeof(kMemoryLayout) / sizeof(kMemoryLayout[0]); #ifndef __clang__ __attribute__((optimize("unroll-loops"))) #endif -inline bool addr_is_type(uptr addr, MappingDesc::Type mapping_type) { +inline bool +addr_is_type(uptr addr, int mapping_types) { // It is critical for performance that this loop is unrolled (because then it is // simplified into just a few constant comparisons). 
#ifdef __clang__ #pragma unroll #endif for (unsigned i = 0; i < kMemoryLayoutSize; ++i) - if (kMemoryLayout[i].type == mapping_type && + if ((kMemoryLayout[i].type & mapping_types) && addr >= kMemoryLayout[i].start && addr < kMemoryLayout[i].end) return true; return false; } -#define MEM_IS_APP(mem) addr_is_type((uptr)(mem), MappingDesc::APP) +#define MEM_IS_APP(mem) \ + (addr_is_type((uptr)(mem), MappingDesc::APP | MappingDesc::ALLOCATOR)) #define MEM_IS_SHADOW(mem) addr_is_type((uptr)(mem), MappingDesc::SHADOW) #define MEM_IS_ORIGIN(mem) addr_is_type((uptr)(mem), MappingDesc::ORIGIN) @@ -250,7 +263,7 @@ extern bool msan_init_is_running; extern int msan_report_count; bool ProtectRange(uptr beg, uptr end); -bool InitShadow(bool init_origins); +bool InitShadowWithReExec(bool init_origins); char *GetProcSelfMaps(); void InitializeInterceptors(); diff --git a/compiler-rt/lib/msan/msan_allocator.cpp b/compiler-rt/lib/msan/msan_allocator.cpp index 0b2dd2b2f1883d..b1bc5b9390f75b 100644 --- a/compiler-rt/lib/msan/msan_allocator.cpp +++ b/compiler-rt/lib/msan/msan_allocator.cpp @@ -48,6 +48,9 @@ struct MsanMapUnmapCallback { } }; +// Note: to ensure that the allocator is compatible with the application memory +// layout (especially with high-entropy ASLR), kSpaceBeg and kSpaceSize must be +// duplicated as MappingDesc::ALLOCATOR in msan.h. 
#if defined(__mips64) static const uptr kMaxAllowedMallocSize = 2UL << 30; diff --git a/compiler-rt/lib/msan/msan_linux.cpp b/compiler-rt/lib/msan/msan_linux.cpp index c7ecb7cad56661..cd2d9f5c720c57 100644 --- a/compiler-rt/lib/msan/msan_linux.cpp +++ b/compiler-rt/lib/msan/msan_linux.cpp @@ -20,6 +20,9 @@ # include # include # include +# if SANITIZER_LINUX +# include +# endif # include # include # include @@ -43,11 +46,13 @@ void ReportMapRange(const char *descr, uptr beg, uptr size) { } } -static bool CheckMemoryRangeAvailability(uptr beg, uptr size) { +static bool CheckMemoryRangeAvailability(uptr beg, uptr size, bool verbose) { if (size > 0) { uptr end = beg + size - 1; if (!MemoryRangeIsAvailable(beg, end)) { - Printf("FATAL: Memory range 0x%zx - 0x%zx is not available.\n", beg, end); + if (verbose) + Printf("FATAL: Memory range 0x%zx - 0x%zx is not available.\n", beg, + end); return false; } } @@ -86,7 +91,7 @@ static void CheckMemoryLayoutSanity() { CHECK(addr_is_type(start, type)); CHECK(addr_is_type((start + end) / 2, type)); CHECK(addr_is_type(end - 1, type)); - if (type == MappingDesc::APP) { + if (type == MappingDesc::APP || type == MappingDesc::ALLOCATOR) { uptr addr = start; CHECK(MEM_IS_SHADOW(MEM_TO_SHADOW(addr))); CHECK(MEM_IS_ORIGIN(MEM_TO_ORIGIN(addr))); @@ -106,7 +111,7 @@ static void CheckMemoryLayoutSanity() { } } -bool InitShadow(bool init_origins) { +static bool InitShadow(bool init_origins, bool dry_run) { // Let user know mapping parameters first. VPrintf(1, "__msan_init %p\n", reinterpret_cast(&__msan_init)); for (unsigned i = 0; i < kMemoryLayoutSize; ++i) @@ -116,8 +121,9 @@ bool InitShadow(bool init_origins) { CheckMemoryLayoutSanity(); if (!MEM_IS_APP(&__msan_init)) { - Printf("FATAL: Code %p is out of application range. Non-PIE build?\n", - reinterpret_cast(&__msan_init)); + if (!dry_run) + Printf("FATAL: Code %p is out of application range. 
Non-PIE build?\n", + reinterpret_cast(&__msan_init)); return false; } @@ -138,20 +144,26 @@ bool InitShadow(bool init_origins) { bool protect = type == MappingDesc::INVALID || (!init_origins && type == MappingDesc::ORIGIN); CHECK(!(map && protect)); - if (!map && !protect) - CHECK(type == MappingDesc::APP); + if (!map && !protect) { + CHECK(type == MappingDesc::APP || type == MappingDesc::ALLOCATOR); + + if (dry_run && type == MappingDesc::ALLOCATOR && + !CheckMemoryRangeAvailability(start, size, !dry_run)) + return false; + } if (map) { - if (!CheckMemoryRangeAvailability(start, size)) + if (dry_run && !CheckMemoryRangeAvailability(start, size, !dry_run)) return false; - if (!MmapFixedSuperNoReserve(start, size, kMemoryLayout[i].name)) + if (!dry_run && + !MmapFixedSuperNoReserve(start, size, kMemoryLayout[i].name)) return false; - if (common_flags()->use_madv_dontdump) + if (!dry_run && common_flags()->use_madv_dontdump) DontDumpShadowMemory(start, size); } if (protect) { - if (!CheckMemoryRangeAvailability(start, size)) + if (dry_run && !CheckMemoryRangeAvailability(start, size, !dry_run)) return false; - if (!ProtectMemoryRange(start, size, kMemoryLayout[i].name)) + if (!dry_run && !ProtectMemoryRange(start, size, kMemoryLayout[i].name)) return false; } } @@ -159,6 +171,35 @@ bool InitShadow(bool init_origins) { return true; } +bool InitShadowWithReExec(bool init_origins) { + // Start with dry run: check layout is ok, but don't print warnings because + // warning messages will cause tests to fail (even if we successfully re-exec + // after the warning). + bool success = InitShadow(__msan_get_track_origins(), true); + if (!success) { +# if SANITIZER_LINUX + // Perhaps ASLR entropy is too high. If ASLR is enabled, re-exec without it. 
+ int old_personality = personality(0xffffffff); + bool aslr_on = + (old_personality != -1) && ((old_personality & ADDR_NO_RANDOMIZE) == 0); + + if (aslr_on) { + VReport(1, + "WARNING: MemorySanitizer: memory layout is incompatible, " + "possibly due to high-entropy ASLR.\n" + "Re-execing with fixed virtual address space.\n" + "N.B. reducing ASLR entropy is preferable.\n"); + CHECK_NE(personality(old_personality | ADDR_NO_RANDOMIZE), -1); + ReExec(); + } +# endif + } + + // The earlier dry run didn't actually map or protect anything. Run again in + // non-dry run mode. + return success && InitShadow(__msan_get_track_origins(), false); +} + static void MsanAtExit(void) { if (flags()->print_stats && (flags()->atexit || msan_report_count > 0)) ReportStats(); diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index 867ae73f0d3b27..f3b457d786e6bd 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -677,6 +677,7 @@ static void initializeProfileForContinuousMode(void) { PROF_ERR("Continuous counter sync mode is enabled, but raw profile is not" "page-aligned. 
CurrentFileOffset = %" PRIu64 ", pagesz = %u.\n", (uint64_t)CurrentFileOffset, PageSize); + fclose(File); return; } if (writeProfileWithFileObject(Filename, File) != 0) { @@ -692,6 +693,8 @@ static void initializeProfileForContinuousMode(void) { if (doMerging()) { lprofUnlockFileHandle(File); + } + if (File != NULL) { fclose(File); } } diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c b/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c index 9f46a98d78ac4e..002bec164d7e85 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c @@ -195,6 +195,8 @@ static const int dummy_name[0] COMPILER_RT_SECTION( COMPILER_RT_SEG INSTR_PROF_NAME_SECT_NAME); static int dummy_vnds[0] COMPILER_RT_SECTION( COMPILER_RT_SEG INSTR_PROF_VNODES_SECT_NAME); +static int dummy_orderfile[0] COMPILER_RT_SECTION( + COMPILER_RT_SEG INSTR_PROF_ORDERFILE_SECT_NAME); // To avoid GC'ing of the dummy variables by the linker, reference them in an // array and reference the array in the runtime registration code @@ -206,7 +208,7 @@ static int dummy_vnds[0] COMPILER_RT_SECTION( COMPILER_RT_VISIBILITY void *__llvm_profile_keep[] = {(void *)&dummy_cnts, (void *)&dummy_bits, (void *)&dummy_data, (void *)&dummy_name, - (void *)&dummy_vnds}; + (void *)&dummy_vnds, (void *)&dummy_orderfile}; #ifdef __GNUC__ #pragma GCC diagnostic pop #endif diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c b/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c index c976776ae59e9c..0751b28f81d0ac 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c @@ -77,7 +77,7 @@ ValueProfNode *EndVNode = &VNodesEnd; /* lld-link provides __buildid symbol which ponits to the 16 bytes build id when * using /build-id flag. 
https://lld.llvm.org/windows_support.html#lld-flags */ #define BUILD_ID_LEN 16 -COMPILER_RT_WEAK extern uint8_t __buildid[BUILD_ID_LEN]; +COMPILER_RT_WEAK uint8_t __buildid[BUILD_ID_LEN]; COMPILER_RT_VISIBILITY int __llvm_write_binary_ids(ProfDataWriter *Writer) { if (*__buildid) { if (Writer && diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc index 1b56bebac64e68..3ecdb55cdbf72f 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc @@ -10218,20 +10218,6 @@ INTERCEPTOR(int, __xuname, int size, void *utsname) { #define INIT___XUNAME #endif -#if SANITIZER_INTERCEPT_HEXDUMP -INTERCEPTOR(void, hexdump, const void *ptr, int length, const char *header, int flags) { - void *ctx; - COMMON_INTERCEPTOR_ENTER(ctx, hexdump, ptr, length, header, flags); - COMMON_INTERCEPTOR_READ_RANGE(ctx, ptr, length); - COMMON_INTERCEPTOR_READ_RANGE(ctx, header, internal_strlen(header) + 1); - REAL(hexdump)(ptr, length, header, flags); -} - -#define INIT_HEXDUMP COMMON_INTERCEPT_FUNCTION(hexdump); -#else -#define INIT_HEXDUMP -#endif - #if SANITIZER_INTERCEPT_ARGP_PARSE INTERCEPTOR(int, argp_parse, const struct argp *argp, int argc, char **argv, unsigned flags, int *arg_index, void *input) { @@ -10581,7 +10567,6 @@ static void InitializeCommonInterceptors() { INIT_PROCCTL INIT_UNAME; INIT___XUNAME; - INIT_HEXDUMP; INIT_ARGP_PARSE; INIT_CPUSET_GETAFFINITY; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h index 0ce4e9351bc1da..de55c736d0e144 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h @@ -596,7 +596,6 @@ #define SANITIZER_INTERCEPT___XUNAME SI_FREEBSD #define SANITIZER_INTERCEPT_FLOPEN 
SI_FREEBSD #define SANITIZER_INTERCEPT_PROCCTL SI_FREEBSD -#define SANITIZER_INTERCEPT_HEXDUMP SI_FREEBSD #define SANITIZER_INTERCEPT_ARGP_PARSE SI_GLIBC #define SANITIZER_INTERCEPT_CPUSET_GETAFFINITY SI_FREEBSD diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp index 8438e019591b58..f6b157c07c6557 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp @@ -34,8 +34,10 @@ static bool FrameIsInternal(const SymbolizedStack *frame) { return true; const char *file = frame->info.file; const char *module = frame->info.module; + // On Gentoo, the path is g++-*, so there's *not* a missing /. if (file && (internal_strstr(file, "/compiler-rt/lib/") || - internal_strstr(file, "/include/c++/"))) + internal_strstr(file, "/include/c++/") || + internal_strstr(file, "/include/g++"))) return true; if (module && (internal_strstr(module, "libclang_rt."))) return true; diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp index a9f6673ac44e90..d0282c27043125 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp @@ -14,6 +14,7 @@ #include "sanitizer_common/sanitizer_atomic.h" #include "sanitizer_common/sanitizer_errno.h" +#include "sanitizer_common/sanitizer_glibc_version.h" #include "sanitizer_common/sanitizer_libc.h" #include "sanitizer_common/sanitizer_linux.h" #include "sanitizer_common/sanitizer_platform_limits_netbsd.h" @@ -1613,47 +1614,40 @@ TSAN_INTERCEPTOR(int, __fxstat, int version, int fd, void *buf) { FdAccess(thr, pc, fd); return REAL(__fxstat)(version, fd, buf); } -#define TSAN_MAYBE_INTERCEPT___FXSTAT TSAN_INTERCEPT(__fxstat) + +TSAN_INTERCEPTOR(int, __fxstat64, int version, int fd, void *buf) { + SCOPED_TSAN_INTERCEPTOR(__fxstat64, version, fd, buf); 
+ if (fd > 0) + FdAccess(thr, pc, fd); + return REAL(__fxstat64)(version, fd, buf); +} +#define TSAN_MAYBE_INTERCEPT___FXSTAT TSAN_INTERCEPT(__fxstat); TSAN_INTERCEPT(__fxstat64) #else #define TSAN_MAYBE_INTERCEPT___FXSTAT #endif +#if !SANITIZER_GLIBC || __GLIBC_PREREQ(2, 33) TSAN_INTERCEPTOR(int, fstat, int fd, void *buf) { -#if SANITIZER_GLIBC - SCOPED_TSAN_INTERCEPTOR(__fxstat, 0, fd, buf); - if (fd > 0) - FdAccess(thr, pc, fd); - return REAL(__fxstat)(0, fd, buf); -#else SCOPED_TSAN_INTERCEPTOR(fstat, fd, buf); if (fd > 0) FdAccess(thr, pc, fd); return REAL(fstat)(fd, buf); -#endif -} - -#if SANITIZER_GLIBC -TSAN_INTERCEPTOR(int, __fxstat64, int version, int fd, void *buf) { - SCOPED_TSAN_INTERCEPTOR(__fxstat64, version, fd, buf); - if (fd > 0) - FdAccess(thr, pc, fd); - return REAL(__fxstat64)(version, fd, buf); } -#define TSAN_MAYBE_INTERCEPT___FXSTAT64 TSAN_INTERCEPT(__fxstat64) +# define TSAN_MAYBE_INTERCEPT_FSTAT TSAN_INTERCEPT(fstat) #else -#define TSAN_MAYBE_INTERCEPT___FXSTAT64 +# define TSAN_MAYBE_INTERCEPT_FSTAT #endif -#if SANITIZER_GLIBC +#if __GLIBC_PREREQ(2, 33) TSAN_INTERCEPTOR(int, fstat64, int fd, void *buf) { - SCOPED_TSAN_INTERCEPTOR(__fxstat64, 0, fd, buf); + SCOPED_TSAN_INTERCEPTOR(fstat64, fd, buf); if (fd > 0) FdAccess(thr, pc, fd); - return REAL(__fxstat64)(0, fd, buf); + return REAL(fstat64)(fd, buf); } -#define TSAN_MAYBE_INTERCEPT_FSTAT64 TSAN_INTERCEPT(fstat64) +# define TSAN_MAYBE_INTERCEPT_FSTAT64 TSAN_INTERCEPT(fstat64) #else -#define TSAN_MAYBE_INTERCEPT_FSTAT64 +# define TSAN_MAYBE_INTERCEPT_FSTAT64 #endif TSAN_INTERCEPTOR(int, open, const char *name, int oflag, ...) 
{ @@ -2950,10 +2944,9 @@ void InitializeInterceptors() { TSAN_INTERCEPT(pthread_once); - TSAN_INTERCEPT(fstat); TSAN_MAYBE_INTERCEPT___FXSTAT; + TSAN_MAYBE_INTERCEPT_FSTAT; TSAN_MAYBE_INTERCEPT_FSTAT64; - TSAN_MAYBE_INTERCEPT___FXSTAT64; TSAN_INTERCEPT(open); TSAN_MAYBE_INTERCEPT_OPEN64; TSAN_INTERCEPT(creat); diff --git a/compiler-rt/test/profile/AIX/bexpfull-pgo.c b/compiler-rt/test/profile/AIX/bexpfull-pgo.c new file mode 100644 index 00000000000000..f48242ec6bfeaa --- /dev/null +++ b/compiler-rt/test/profile/AIX/bexpfull-pgo.c @@ -0,0 +1,7 @@ +// RUN: %clang_pgogen %s -bexpall +// RUN: %clang_pgogen %s -bexpfull + +#include +int ar[10]; +int n; +int main() { memcpy(ar, ar + 1, n); }; diff --git a/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/hexdump.cc b/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/hexdump.cc deleted file mode 100644 index e07650d64102a4..00000000000000 --- a/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/hexdump.cc +++ /dev/null @@ -1,23 +0,0 @@ -// RUN: %clangxx -O0 -g %s -o %t -lutil && %run %t 2>&1 | FileCheck %s - -#include -#include -#include -#include - -int main(void) { - printf("hexdump"); - char *line; - size_t lineno = 0, len; - const char *delim = "\\\\#"; - FILE *fp = fopen("/etc/fstab", "r"); - assert(fp); - line = fparseln(fp, &len, &lineno, delim, 0); - hexdump(line, len, nullptr, 0); - free(line); - fclose(fp); - assert(lineno != 0); - assert(len > 0); - - return 0; -} diff --git a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_inline8bit_counter_default_impl.cpp b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_inline8bit_counter_default_impl.cpp index 1ac04b53491e14..1d1fbf7299e8b4 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_inline8bit_counter_default_impl.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_inline8bit_counter_default_impl.cpp @@ -3,7 +3,9 @@ // REQUIRES: has_sancovcc,stable-runtime,linux,x86_64-target-arch 
-// RUN: %clangxx -O0 %s -fsanitize-coverage=inline-8bit-counters,pc-table -o %t +/// In glibc 2.39+, fprintf has a nonnull attribute. Disable nonnull-attribute, +/// which would increase counters for ubsan. +// RUN: %clangxx -O0 %s -fsanitize-coverage=inline-8bit-counters,pc-table -fno-sanitize=nonnull-attribute -o %t // RUN: rm -f %t-counters %t-pcs // RUN: env %tool_options="cov_8bit_counters_out=%t-counters cov_pcs_out=%t-pcs verbosity=1" %run %t 2>&1 | FileCheck %s diff --git a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_symbolize.cpp b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_symbolize.cpp index daa994c8116251..b168954a1c92cf 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_symbolize.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_symbolize.cpp @@ -7,7 +7,9 @@ // RUN: rm -rf $DIR // RUN: mkdir -p $DIR // RUN: cd $DIR -// RUN: %clangxx -O0 -fsanitize-coverage=trace-pc-guard %s -o %t +/// In glibc 2.39+, fprintf has a nonnull attribute. Disable nonnull-attribute, +/// which would increase counters for ubsan. 
+// RUN: %clangxx -O0 -fsanitize-coverage=trace-pc-guard -fno-sanitize=nonnull-attribute %s -o %t // RUN: %env_tool_opts=coverage=1 %t 2>&1 | FileCheck %s // RUN: rm -rf $DIR diff --git a/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl b/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl new file mode 100644 index 00000000000000..867273126f8fdf --- /dev/null +++ b/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl @@ -0,0 +1,107 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef int v8i __attribute__((ext_vector_type(8))); + +// Wave32 + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_f16_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf16_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f16_16x16x16_f16_w32: +// CHECK-GFX1200: v_wmma_f16_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a, b, c); +} + + 
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_bf16_16x16x16_bf16_w32: +// CHECK-GFX1200: v_wmma_bf16_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu8_w32: +// CHECK-GFX1200: v_wmma_i32_16x16x16_iu8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a, true, b, c, false); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu4_w32: +// CHECK-GFX1200: v_wmma_i32_16x16x16_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_fp8 
v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x32_iu4_w32: +// CHECK-GFX1200: v_wmma_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false); +} diff --git a/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl b/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl new file mode 100644 index 00000000000000..ad01450c35a76f --- /dev/null +++ b/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl @@ -0,0 +1,104 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef float v4f __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef int v4i __attribute__((ext_vector_type(4))); + +// Wave64 + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_f16_w64: +// CHECK-GFX1200: v_wmma_f32_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c) +{ + 
*out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf16_w64: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f16_16x16x16_f16_w64: +// CHECK-GFX1200: v_wmma_f16_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_bf16_16x16x16_bf16_w64: +// CHECK-GFX1200: v_wmma_bf16_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu8_w64: +// CHECK-GFX1200: v_wmma_i32_16x16x16_iu8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12(true, a, true, b, c, false); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu4_w64: +// CHECK-GFX1200: v_wmma_i32_16x16x16_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(true, a, true, b, c, false); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32: +// CHECK-GFX1200: 
v_wmma_f32_16x16x16_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x32_iu4_w32: +// CHECK-GFX1200: v_wmma_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(true, a, true, b, c, false); +} diff --git a/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl b/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl new file mode 100644 index 00000000000000..317d9a1102ccff --- /dev/null +++ b/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl @@ -0,0 +1,110 @@ +// RUN: 
%clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef int v8i __attribute__((ext_vector_type(8))); +typedef half v16h __attribute__((ext_vector_type(16))); +typedef short v16s __attribute__((ext_vector_type(16))); + +// Wave32 + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_f16_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf16_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8s a, v16s b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f16_16x16x32_f16_w32: +// CHECK-GFX1200: v_swmmac_f16_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h c, short index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_bf16_16x16x32_bf16_w32: +// CHECK-GFX1200: v_swmmac_bf16_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8s* out, v8s 
a, v16s b, v8s c, short index) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu8_w32: +// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, v2i a, v4i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu4_w32: +// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x64_iu4_w32: +// CHECK-GFX1200: v_swmmac_i32_16x16x64_iu4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, 
v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(a, b, c, index); +} diff --git a/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl b/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl new file mode 100644 index 00000000000000..eb81234f53a663 --- /dev/null +++ b/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl @@ -0,0 +1,109 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef float v4f __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); + +// Wave64 + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_f16_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_f16_w64(global v4f* out, v4h a, v8h b, v4f 
c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf16_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf16_w64(global v4f* out, v4s a, v8s b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f16_16x16x32_f16_w64: +// CHECK-GFX1200: v_swmmac_f16_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f16_16x16x32_f16_w64(global v4h* out, v4h a, v8h b, v4h c, short index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_bf16_16x16x32_bf16_w64: +// CHECK-GFX1200: v_swmmac_bf16_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(global v4s* out, v4s a, v8s b, v4s c, short index) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu8_w64: +// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x32_iu8_w64(global v4i* out, int a, v2i b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu4_w64: +// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64(true, a, true, b, c, 
index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x64_iu4_w64: +// CHECK-GFX1200: v_swmmac_i32_16x16x64_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x64_iu4_w64(global v4i* out, int a, v2i b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(a, b, c, index); +} diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md index 
5ade2574032977..5ad6d01e8a8ed6 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -852,13 +852,13 @@ used in constant expressions have currently no folding support at all. - **Syntax:** `CALL EXECUTE_COMMAND_LINE(COMMAND [, WAIT, EXITSTAT, CMDSTAT, CMDMSG ])` - **Arguments:** - | Argument | Description | - |-----------|--------------------------------------------------------------| - | `COMMAND` | Shall be a default CHARACTER scalar. | - | `WAIT` | (Optional) Shall be a default LOGICAL scalar. | - | `EXITSTAT`| (Optional) Shall be an INTEGER of the default kind. | - | `CMDSTAT` | (Optional) Shall be an INTEGER of the default kind. | - | `CMDMSG` | (Optional) Shall be a CHARACTER scalar of the default kind. | +| Argument | Description | +|------------|-----------------------------------------------------------------------| +| `COMMAND` | Shall be a default CHARACTER scalar. | +| `WAIT` | (Optional) Shall be a default LOGICAL scalar. | +| `EXITSTAT` | (Optional) Shall be an INTEGER with kind greater than or equal to 4. | +| `CMDSTAT` | (Optional) Shall be an INTEGER with kind greater than or equal to 2. | +| `CMDMSG` | (Optional) Shall be a CHARACTER scalar of the default kind. | #### Implementation Specifics diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index da6d5970089884..1701a475942ff5 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -78,6 +78,8 @@ static constexpr CategorySet AnyType{IntrinsicType | DerivedType}; ENUM_CLASS(KindCode, none, defaultIntegerKind, defaultRealKind, // is also the default COMPLEX kind doublePrecision, defaultCharKind, defaultLogicalKind, + greaterOrEqualToKind, // match kind value greater than or equal to a single + // explicit kind value any, // matches any kind value; each instance is independent // match any kind, but all "same" kinds must be equal. For characters, also // implies that lengths must be equal. 
@@ -104,7 +106,7 @@ ENUM_CLASS(KindCode, none, defaultIntegerKind, struct TypePattern { CategorySet categorySet; KindCode kindCode{KindCode::none}; - int exactKindValue{0}; // for KindCode::exactKind + int kindValue{0}; // for KindCode::exactKind and greaterOrEqualToKind llvm::raw_ostream &Dump(llvm::raw_ostream &) const; }; @@ -1314,10 +1316,11 @@ static const IntrinsicInterface intrinsicSubroutine[]{ {"execute_command_line", {{"command", DefaultChar, Rank::scalar}, {"wait", AnyLogical, Rank::scalar, Optionality::optional}, - {"exitstat", AnyInt, Rank::scalar, Optionality::optional, - common::Intent::InOut}, - {"cmdstat", AnyInt, Rank::scalar, Optionality::optional, - common::Intent::Out}, + {"exitstat", + TypePattern{IntType, KindCode::greaterOrEqualToKind, 4}, + Rank::scalar, Optionality::optional, common::Intent::InOut}, + {"cmdstat", TypePattern{IntType, KindCode::greaterOrEqualToKind, 2}, + Rank::scalar, Optionality::optional, common::Intent::Out}, {"cmdmsg", DefaultChar, Rank::scalar, Optionality::optional, common::Intent::InOut}}, {}, Rank::elemental, IntrinsicClass::impureSubroutine}, @@ -1834,7 +1837,10 @@ std::optional IntrinsicInterface::Match( argOk = true; break; case KindCode::exactKind: - argOk = type->kind() == d.typePattern.exactKindValue; + argOk = type->kind() == d.typePattern.kindValue; + break; + case KindCode::greaterOrEqualToKind: + argOk = type->kind() >= d.typePattern.kindValue; break; case KindCode::sameAtom: if (!sameArg) { @@ -2177,8 +2183,9 @@ std::optional IntrinsicInterface::Match( resultType = DynamicType{ GetBuiltinDerivedType(builtinsScope, "__builtin_team_type")}; break; + case KindCode::greaterOrEqualToKind: case KindCode::exactKind: - resultType = DynamicType{*category, result.exactKindValue}; + resultType = DynamicType{*category, result.kindValue}; break; case KindCode::typeless: case KindCode::any: diff --git a/flang/test/Semantics/execute_command_line.f90 b/flang/test/Semantics/execute_command_line.f90 new file mode 100644 
index 00000000000000..a66bbce705715d --- /dev/null +++ b/flang/test/Semantics/execute_command_line.f90 @@ -0,0 +1,29 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 -pedantic +! Tests for the EXECUTE_COMMAND_LINE intrinsics + +subroutine bad_kind_error(command, exitVal, cmdVal) +CHARACTER(30) :: command +INTEGER(KIND=2) :: exitVal +INTEGER(KIND=1) :: cmdVal +!ERROR: Actual argument for 'exitstat=' has bad type or kind 'INTEGER(2)' +call execute_command_line(command, exitstat=exitVal) + +!ERROR: Actual argument for 'cmdstat=' has bad type or kind 'INTEGER(1)' +call execute_command_line(command, cmdstat=cmdVal) +end subroutine bad_kind_error + +subroutine good_kind_equal(command, exitVal, cmdVal) +CHARACTER(30) :: command +INTEGER(KIND=4) :: exitVal +INTEGER(KIND=2) :: cmdVal +call execute_command_line(command, exitstat=exitVal) +call execute_command_line(command, cmdstat=cmdVal) +end subroutine good_kind_equal + +subroutine good_kind_greater(command, exitVal, cmdVal) +CHARACTER(30) :: command +INTEGER(KIND=8) :: exitVal +INTEGER(KIND=4) :: cmdVal +call execute_command_line(command, exitstat=exitVal) +call execute_command_line(command, cmdstat=cmdVal) +end subroutine good_kind_greater diff --git a/libcxx/docs/Modules.rst b/libcxx/docs/Modules.rst index 533c3fbd2a1eea..ee2b81d3b9e7ca 100644 --- a/libcxx/docs/Modules.rst +++ b/libcxx/docs/Modules.rst @@ -218,9 +218,13 @@ Building this project is done with the following steps, assuming the files $ mkdir build $ cmake -G Ninja -S . -B build -DCMAKE_CXX_COMPILER= -DLIBCXX_BUILD= + $ ninja -j1 std -C build $ ninja -C build $ build/main +.. note:: The ``std`` dependencies of ``std.compat`` are not always resolved when + building the ``std`` target using multiple jobs. + .. warning:: ```` should point point to the real binary and not to a symlink.
diff --git a/libcxx/docs/ReleaseNotes/18.rst b/libcxx/docs/ReleaseNotes/18.rst index fb3d2af544c287..7ea13e6943dd4c 100644 --- a/libcxx/docs/ReleaseNotes/18.rst +++ b/libcxx/docs/ReleaseNotes/18.rst @@ -125,6 +125,8 @@ Improvements and New Features ``${PREFIX}/share/libc++/v1``. - AddressSanitizer annotations have been added to ``std::basic_string``. + These annotations are enabled for all allocators by default. + They are only enabled for long strings; strings using the small buffer optimization are not annotated. - The libc++ source code has been formatted with ``clang-format``. This `discourse thread `_ @@ -275,11 +277,10 @@ ABI Affecting Changes results in an ABI break, however in practice we expect uses of ``std::projected`` in ABI-sensitive places to be extremely rare. Any error resulting from this change should result in a link-time error. -- Under the unstable ABI, the internal alignment requirements for heap allocations - inside ``std::string`` has decreased from 16 to 8. This saves memory since string requests fewer additional - bytes than it did previously. However, this also changes the return value of ``std::string::max_size`` - and can cause code compiled against older libc++ versions but linked at runtime to a new version - to throw a different exception when attempting allocations that are too large +- The internal alignment requirements for heap allocations inside ``std::string`` have decreased from 16 to 8. This + saves memory since string requests fewer additional bytes than it did previously. However, this also changes the + return value of ``std::string::max_size`` and can cause code compiled against older libc++ versions but linked at + runtime to a new version to throw a different exception when attempting allocations that are too large
- The layout of some range adaptors that use the ``movable-box`` exposition-only type as an implementation @@ -342,3 +343,6 @@ Build System Changes them so that they can interoperate with other system-provided libraries that might be using a different unwinding library (such as ``libgcc_s``), you should pass ``LIBCXXABI_USE_LLVM_UNWINDER=OFF`` and ``COMPILER_RT_USE_LLVM_UNWINDER=OFF`` to make sure the system-provided unwinding library is used by the LLVM runtimes. + +- In Clang-cl configurations, libc++ can now be built against the static and/or + debug MSVC C runtimes diff --git a/libcxx/include/__algorithm/copy_move_common.h b/libcxx/include/__algorithm/copy_move_common.h index b350507e32bae9..0fc7a5e3cee700 100644 --- a/libcxx/include/__algorithm/copy_move_common.h +++ b/libcxx/include/__algorithm/copy_move_common.h @@ -31,6 +31,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD // Type traits. @@ -132,4 +135,6 @@ __dispatch_copy_or_move(_InIter __first, _Sent __last, _OutIter __out_first) { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_COPY_MOVE_COMMON_H diff --git a/libcxx/include/__algorithm/equal.h b/libcxx/include/__algorithm/equal.h index f03f010aa51ab3..3c0e3060e39a99 100644 --- a/libcxx/include/__algorithm/equal.h +++ b/libcxx/include/__algorithm/equal.h @@ -30,6 +30,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -162,4 +165,6 @@ equal(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_EQUAL_H diff --git a/libcxx/include/__algorithm/equal_range.h b/libcxx/include/__algorithm/equal_range.h index 7ce54965fff05f..a94290431971c4 100644 --- a/libcxx/include/__algorithm/equal_range.h +++ b/libcxx/include/__algorithm/equal_range.h @@ -31,6 +31,9 @@ # pragma GCC system_header #endif 
+_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -77,4 +80,6 @@ equal_range(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __valu _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_EQUAL_RANGE_H diff --git a/libcxx/include/__algorithm/fold.h b/libcxx/include/__algorithm/fold.h index 88e6814d5cf99d..1a9d76b50d83c9 100644 --- a/libcxx/include/__algorithm/fold.h +++ b/libcxx/include/__algorithm/fold.h @@ -32,6 +32,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 23 @@ -122,4 +125,6 @@ inline constexpr auto fold_left = __fold_left(); _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_FOLD_H diff --git a/libcxx/include/__algorithm/in_found_result.h b/libcxx/include/__algorithm/in_found_result.h index 88a0255d169831..a67ae387974c0a 100644 --- a/libcxx/include/__algorithm/in_found_result.h +++ b/libcxx/include/__algorithm/in_found_result.h @@ -18,6 +18,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -46,4 +49,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_IN_FOUND_RESULT_H diff --git a/libcxx/include/__algorithm/in_fun_result.h b/libcxx/include/__algorithm/in_fun_result.h index 6110c1cf86cd52..a22069a9a8ddaa 100644 --- a/libcxx/include/__algorithm/in_fun_result.h +++ b/libcxx/include/__algorithm/in_fun_result.h @@ -18,6 +18,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -46,4 +49,6 @@ struct in_fun_result { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_IN_FUN_RESULT_H diff --git a/libcxx/include/__algorithm/in_in_out_result.h b/libcxx/include/__algorithm/in_in_out_result.h index 
95ce4f4fd5bd44..ba0380b5c68147 100644 --- a/libcxx/include/__algorithm/in_in_out_result.h +++ b/libcxx/include/__algorithm/in_in_out_result.h @@ -18,6 +18,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -51,4 +54,6 @@ struct in_in_out_result { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_IN_IN_OUT_RESULT_H diff --git a/libcxx/include/__algorithm/in_in_result.h b/libcxx/include/__algorithm/in_in_result.h index d1d62dae7f6703..994573fc70fd88 100644 --- a/libcxx/include/__algorithm/in_in_result.h +++ b/libcxx/include/__algorithm/in_in_result.h @@ -18,6 +18,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -48,4 +51,6 @@ struct in_in_result { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_IN_IN_RESULT_H diff --git a/libcxx/include/__algorithm/in_out_out_result.h b/libcxx/include/__algorithm/in_out_out_result.h index 14364236875086..8ceb452841a419 100644 --- a/libcxx/include/__algorithm/in_out_out_result.h +++ b/libcxx/include/__algorithm/in_out_out_result.h @@ -18,6 +18,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -49,4 +52,6 @@ struct in_out_out_result { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_IN_OUT_OUT_RESULT_H diff --git a/libcxx/include/__algorithm/includes.h b/libcxx/include/__algorithm/includes.h index 531752e9317569..05d45365eb806f 100644 --- a/libcxx/include/__algorithm/includes.h +++ b/libcxx/include/__algorithm/includes.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -71,4 +74,6 @@ includes(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __fi 
_LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_INCLUDES_H diff --git a/libcxx/include/__algorithm/next_permutation.h b/libcxx/include/__algorithm/next_permutation.h index d66ea9b973453f..011ee028cc2f52 100644 --- a/libcxx/include/__algorithm/next_permutation.h +++ b/libcxx/include/__algorithm/next_permutation.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -67,4 +70,6 @@ next_permutation(_BidirectionalIterator __first, _BidirectionalIterator __last) _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_NEXT_PERMUTATION_H diff --git a/libcxx/include/__algorithm/nth_element.h b/libcxx/include/__algorithm/nth_element.h index 37ddfbdacf044d..da748d7255aba6 100644 --- a/libcxx/include/__algorithm/nth_element.h +++ b/libcxx/include/__algorithm/nth_element.h @@ -23,6 +23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -253,4 +256,6 @@ nth_element(_RandomAccessIterator __first, _RandomAccessIterator __nth, _RandomA _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_NTH_ELEMENT_H diff --git a/libcxx/include/__algorithm/partial_sort.h b/libcxx/include/__algorithm/partial_sort.h index 27511a124229bb..85a8fdc77aa228 100644 --- a/libcxx/include/__algorithm/partial_sort.h +++ b/libcxx/include/__algorithm/partial_sort.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -83,4 +86,6 @@ partial_sort(_RandomAccessIterator __first, _RandomAccessIterator __middle, _Ran _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PARTIAL_SORT_H diff --git a/libcxx/include/__algorithm/partial_sort_copy.h b/libcxx/include/__algorithm/partial_sort_copy.h index e7d8df4de89f95..ef7c9d34d94983 100644 --- 
a/libcxx/include/__algorithm/partial_sort_copy.h +++ b/libcxx/include/__algorithm/partial_sort_copy.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -82,4 +85,6 @@ partition(_ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PARTITION_H diff --git a/libcxx/include/__algorithm/prev_permutation.h b/libcxx/include/__algorithm/prev_permutation.h index 3e4bbb3fbb1670..8d15b6806401d8 100644 --- a/libcxx/include/__algorithm/prev_permutation.h +++ b/libcxx/include/__algorithm/prev_permutation.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -67,4 +70,6 @@ prev_permutation(_BidirectionalIterator __first, _BidirectionalIterator __last) _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PREV_PERMUTATION_H diff --git a/libcxx/include/__algorithm/pstl_any_all_none_of.h b/libcxx/include/__algorithm/pstl_any_all_none_of.h index d93fdba2224c9b..4b1e0e61b54218 100644 --- a/libcxx/include/__algorithm/pstl_any_all_none_of.h +++ b/libcxx/include/__algorithm/pstl_any_all_none_of.h @@ -23,6 +23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -144,4 +147,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_ANY_ALL_NONE_OF_H diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h index ab2e3172b8b63b..14a0d76741d4c5 100644 --- 
a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -194,4 +197,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_TRANSFORM_REDUCE_H diff --git a/libcxx/include/__algorithm/pstl_copy.h b/libcxx/include/__algorithm/pstl_copy.h index 19f275a0d5d97f..1069dcec0e117a 100644 --- a/libcxx/include/__algorithm/pstl_copy.h +++ b/libcxx/include/__algorithm/pstl_copy.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -113,4 +116,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_COPY_H diff --git a/libcxx/include/__algorithm/pstl_count.h b/libcxx/include/__algorithm/pstl_count.h index 28806fca063701..2781f6bfd3c9e0 100644 --- a/libcxx/include/__algorithm/pstl_count.h +++ b/libcxx/include/__algorithm/pstl_count.h @@ -29,6 +29,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -113,4 +116,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_COUNT_H diff --git a/libcxx/include/__algorithm/pstl_equal.h b/libcxx/include/__algorithm/pstl_equal.h index b343d2675980c9..d235c0f4f41972 100644 --- a/libcxx/include/__algorithm/pstl_equal.h +++ 
b/libcxx/include/__algorithm/pstl_equal.h @@ -21,6 +21,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -167,4 +170,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_EQUAL_H diff --git a/libcxx/include/__algorithm/pstl_fill.h b/libcxx/include/__algorithm/pstl_fill.h index 3057dcc04f1ad7..488b49a0feec96 100644 --- a/libcxx/include/__algorithm/pstl_fill.h +++ b/libcxx/include/__algorithm/pstl_fill.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -108,4 +111,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_FILL_H diff --git a/libcxx/include/__algorithm/pstl_find.h b/libcxx/include/__algorithm/pstl_find.h index adc05ea1a9e55a..5b694db68aead4 100644 --- a/libcxx/include/__algorithm/pstl_find.h +++ b/libcxx/include/__algorithm/pstl_find.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -133,4 +136,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_FIND_H diff --git a/libcxx/include/__algorithm/pstl_for_each.h b/libcxx/include/__algorithm/pstl_for_each.h index 819a43d685abed..bb7b5a61a6dc0d 100644 --- a/libcxx/include/__algorithm/pstl_for_each.h +++ b/libcxx/include/__algorithm/pstl_for_each.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + 
#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -100,4 +103,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_FOR_EACH_H diff --git a/libcxx/include/__algorithm/pstl_generate.h b/libcxx/include/__algorithm/pstl_generate.h index 56538392d5b5dd..7133c6f4f4c621 100644 --- a/libcxx/include/__algorithm/pstl_generate.h +++ b/libcxx/include/__algorithm/pstl_generate.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -106,4 +109,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_GENERATE_H diff --git a/libcxx/include/__algorithm/pstl_is_partitioned.h b/libcxx/include/__algorithm/pstl_is_partitioned.h index 39cf6369339db6..b6543021220727 100644 --- a/libcxx/include/__algorithm/pstl_is_partitioned.h +++ b/libcxx/include/__algorithm/pstl_is_partitioned.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -69,4 +72,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_IS_PARITTIONED diff --git a/libcxx/include/__algorithm/pstl_merge.h b/libcxx/include/__algorithm/pstl_merge.h index ed801451086326..3d262db6bc0c15 100644 --- a/libcxx/include/__algorithm/pstl_merge.h +++ b/libcxx/include/__algorithm/pstl_merge.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 
_LIBCPP_BEGIN_NAMESPACE_STD @@ -84,4 +87,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_MERGE_H diff --git a/libcxx/include/__algorithm/pstl_move.h b/libcxx/include/__algorithm/pstl_move.h index 52baab57591e26..d8441f1a6c2e16 100644 --- a/libcxx/include/__algorithm/pstl_move.h +++ b/libcxx/include/__algorithm/pstl_move.h @@ -27,6 +27,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -76,4 +79,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_MOVE_H diff --git a/libcxx/include/__algorithm/pstl_replace.h b/libcxx/include/__algorithm/pstl_replace.h index 05dee3f6a4f30c..b1caf3fd4ac0a1 100644 --- a/libcxx/include/__algorithm/pstl_replace.h +++ b/libcxx/include/__algorithm/pstl_replace.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -239,4 +242,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_REPLACE_H diff --git a/libcxx/include/__algorithm/pstl_rotate_copy.h b/libcxx/include/__algorithm/pstl_rotate_copy.h index 33dc9a3635f7e8..346aab1d4a55c0 100644 --- a/libcxx/include/__algorithm/pstl_rotate_copy.h +++ b/libcxx/include/__algorithm/pstl_rotate_copy.h @@ -19,6 +19,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -77,4 +80,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // 
!defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_ROTATE_COPY_H diff --git a/libcxx/include/__algorithm/pstl_sort.h b/libcxx/include/__algorithm/pstl_sort.h index 3e71e0aa5ae0a1..a931f768111a23 100644 --- a/libcxx/include/__algorithm/pstl_sort.h +++ b/libcxx/include/__algorithm/pstl_sort.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -74,4 +77,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_SORT_H diff --git a/libcxx/include/__algorithm/pstl_stable_sort.h b/libcxx/include/__algorithm/pstl_stable_sort.h index c9d375535fc450..8ea0bb3f9a8d59 100644 --- a/libcxx/include/__algorithm/pstl_stable_sort.h +++ b/libcxx/include/__algorithm/pstl_stable_sort.h @@ -23,6 +23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -53,4 +56,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_STABLE_SORT_H diff --git a/libcxx/include/__algorithm/pstl_transform.h b/libcxx/include/__algorithm/pstl_transform.h index aad59d1f30e6b9..f95938782fc3bd 100644 --- a/libcxx/include/__algorithm/pstl_transform.h +++ b/libcxx/include/__algorithm/pstl_transform.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -112,4 +115,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // 
_LIBCPP___ALGORITHM_PSTL_TRANSFORM_H diff --git a/libcxx/include/__algorithm/ranges_all_of.h b/libcxx/include/__algorithm/ranges_all_of.h index 39a2ae4de01e99..8976541d590cad 100644 --- a/libcxx/include/__algorithm/ranges_all_of.h +++ b/libcxx/include/__algorithm/ranges_all_of.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -66,4 +69,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_ALL_OF_H diff --git a/libcxx/include/__algorithm/ranges_any_of.h b/libcxx/include/__algorithm/ranges_any_of.h index 2ca8531102eac6..7c775f5f64dec0 100644 --- a/libcxx/include/__algorithm/ranges_any_of.h +++ b/libcxx/include/__algorithm/ranges_any_of.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -66,4 +69,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_ANY_OF_H diff --git a/libcxx/include/__algorithm/ranges_binary_search.h b/libcxx/include/__algorithm/ranges_binary_search.h index 22008e0f1bc8f6..f3b7842d5cccd4 100644 --- a/libcxx/include/__algorithm/ranges_binary_search.h +++ b/libcxx/include/__algorithm/ranges_binary_search.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -65,4 +68,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_BINARY_SEARCH_H diff --git a/libcxx/include/__algorithm/ranges_clamp.h b/libcxx/include/__algorithm/ranges_clamp.h index a1185e7278f0ed..f5ef5fd7f26ec8 100644 --- a/libcxx/include/__algorithm/ranges_clamp.h +++ b/libcxx/include/__algorithm/ranges_clamp.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif 
+_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -58,4 +61,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_CLAMP_H diff --git a/libcxx/include/__algorithm/ranges_contains.h b/libcxx/include/__algorithm/ranges_contains.h index f92fcec587d858..00d0e54019887c 100644 --- a/libcxx/include/__algorithm/ranges_contains.h +++ b/libcxx/include/__algorithm/ranges_contains.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 23 _LIBCPP_BEGIN_NAMESPACE_STD @@ -58,4 +61,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 23 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_CONTAINS_H diff --git a/libcxx/include/__algorithm/ranges_copy.h b/libcxx/include/__algorithm/ranges_copy.h index 1c87f074e7cab9..e1d6d32f05f7e6 100644 --- a/libcxx/include/__algorithm/ranges_copy.h +++ b/libcxx/include/__algorithm/ranges_copy.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -63,4 +66,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_COPY_H diff --git a/libcxx/include/__algorithm/ranges_copy_backward.h b/libcxx/include/__algorithm/ranges_copy_backward.h index 865e944d4384dd..93e326042503fd 100644 --- a/libcxx/include/__algorithm/ranges_copy_backward.h +++ b/libcxx/include/__algorithm/ranges_copy_backward.h @@ -23,6 +23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -61,4 +64,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_COPY_BACKWARD_H diff --git a/libcxx/include/__algorithm/ranges_copy_if.h 
b/libcxx/include/__algorithm/ranges_copy_if.h index b77dbd37fcee3a..4b41d2154e7f83 100644 --- a/libcxx/include/__algorithm/ranges_copy_if.h +++ b/libcxx/include/__algorithm/ranges_copy_if.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -79,4 +82,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_COPY_IF_H diff --git a/libcxx/include/__algorithm/ranges_copy_n.h b/libcxx/include/__algorithm/ranges_copy_n.h index 99e8eee14d0f83..4353fa99278c8b 100644 --- a/libcxx/include/__algorithm/ranges_copy_n.h +++ b/libcxx/include/__algorithm/ranges_copy_n.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -73,4 +76,6 @@ inline constexpr auto copy_n = __copy_n::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_COPY_N_H diff --git a/libcxx/include/__algorithm/ranges_count.h b/libcxx/include/__algorithm/ranges_count.h index 4c8f1b2cbea7e4..a8965c1b961f33 100644 --- a/libcxx/include/__algorithm/ranges_count.h +++ b/libcxx/include/__algorithm/ranges_count.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -58,4 +61,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_COUNT_H diff --git a/libcxx/include/__algorithm/ranges_count_if.h b/libcxx/include/__algorithm/ranges_count_if.h index 92f37d049e0c4d..71b942dd5322b7 100644 --- a/libcxx/include/__algorithm/ranges_count_if.h +++ b/libcxx/include/__algorithm/ranges_count_if.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ 
-71,4 +74,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_COUNT_IF_H diff --git a/libcxx/include/__algorithm/ranges_ends_with.h b/libcxx/include/__algorithm/ranges_ends_with.h index 2afb74bff0f152..c2a3cae9f3b16a 100644 --- a/libcxx/include/__algorithm/ranges_ends_with.h +++ b/libcxx/include/__algorithm/ranges_ends_with.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 23 _LIBCPP_BEGIN_NAMESPACE_STD @@ -193,4 +196,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 23 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_ENDS_WITH_H diff --git a/libcxx/include/__algorithm/ranges_equal.h b/libcxx/include/__algorithm/ranges_equal.h index 4cb1f7df1952e5..31c7ee261da61f 100644 --- a/libcxx/include/__algorithm/ranges_equal.h +++ b/libcxx/include/__algorithm/ranges_equal.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -101,4 +104,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_EQUAL_H diff --git a/libcxx/include/__algorithm/ranges_equal_range.h b/libcxx/include/__algorithm/ranges_equal_range.h index 1ff8856ca03f1e..4c1c3834ba9f9f 100644 --- a/libcxx/include/__algorithm/ranges_equal_range.h +++ b/libcxx/include/__algorithm/ranges_equal_range.h @@ -30,6 +30,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -72,4 +75,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_EQUAL_RANGE_H diff --git a/libcxx/include/__algorithm/ranges_fill.h b/libcxx/include/__algorithm/ranges_fill.h index 88a892f5c27865..7a177d85e9f07f 100644 --- a/libcxx/include/__algorithm/ranges_fill.h 
+++ b/libcxx/include/__algorithm/ranges_fill.h @@ -20,6 +20,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -54,4 +57,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_FILL_H diff --git a/libcxx/include/__algorithm/ranges_fill_n.h b/libcxx/include/__algorithm/ranges_fill_n.h index dbd8ec27aef9f5..a6e988c0089ce4 100644 --- a/libcxx/include/__algorithm/ranges_fill_n.h +++ b/libcxx/include/__algorithm/ranges_fill_n.h @@ -17,6 +17,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -45,4 +48,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_FILL_N_H diff --git a/libcxx/include/__algorithm/ranges_find.h b/libcxx/include/__algorithm/ranges_find.h index de870e381184c6..7459fad717a5d6 100644 --- a/libcxx/include/__algorithm/ranges_find.h +++ b/libcxx/include/__algorithm/ranges_find.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -72,4 +75,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_FIND_H diff --git a/libcxx/include/__algorithm/ranges_find_end.h b/libcxx/include/__algorithm/ranges_find_end.h index 2c57ad424bfdea..0bda4f3e1cea9e 100644 --- a/libcxx/include/__algorithm/ranges_find_end.h +++ b/libcxx/include/__algorithm/ranges_find_end.h @@ -27,6 +27,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -95,4 +98,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_FIND_END_H diff --git 
a/libcxx/include/__algorithm/ranges_find_first_of.h b/libcxx/include/__algorithm/ranges_find_first_of.h index ec6d52c63250b5..63a7b8335faaf5 100644 --- a/libcxx/include/__algorithm/ranges_find_first_of.h +++ b/libcxx/include/__algorithm/ranges_find_first_of.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -98,4 +101,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_FIND_FIRST_OF_H diff --git a/libcxx/include/__algorithm/ranges_find_if.h b/libcxx/include/__algorithm/ranges_find_if.h index af54a5007ee259..52ae55ce96c366 100644 --- a/libcxx/include/__algorithm/ranges_find_if.h +++ b/libcxx/include/__algorithm/ranges_find_if.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -67,4 +70,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_FIND_IF_H diff --git a/libcxx/include/__algorithm/ranges_find_if_not.h b/libcxx/include/__algorithm/ranges_find_if_not.h index a18bea43165e0d..60c6796cbbfcc7 100644 --- a/libcxx/include/__algorithm/ranges_find_if_not.h +++ b/libcxx/include/__algorithm/ranges_find_if_not.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -61,4 +64,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_FIND_IF_NOT_H diff --git a/libcxx/include/__algorithm/ranges_for_each.h b/libcxx/include/__algorithm/ranges_for_each.h index 7878ed26709fb6..225dc774c8764a 100644 --- a/libcxx/include/__algorithm/ranges_for_each.h +++ b/libcxx/include/__algorithm/ranges_for_each.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif 
+_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -73,4 +76,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_FOR_EACH_H diff --git a/libcxx/include/__algorithm/ranges_for_each_n.h b/libcxx/include/__algorithm/ranges_for_each_n.h index 53ccb9a6035a48..d1fdca34cc5a19 100644 --- a/libcxx/include/__algorithm/ranges_for_each_n.h +++ b/libcxx/include/__algorithm/ranges_for_each_n.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -56,4 +59,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_FOR_EACH_N_H diff --git a/libcxx/include/__algorithm/ranges_generate.h b/libcxx/include/__algorithm/ranges_generate.h index 3ff1e13c422090..e6467198e6ba2f 100644 --- a/libcxx/include/__algorithm/ranges_generate.h +++ b/libcxx/include/__algorithm/ranges_generate.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -65,4 +68,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_GENERATE_H diff --git a/libcxx/include/__algorithm/ranges_generate_n.h b/libcxx/include/__algorithm/ranges_generate_n.h index c025c621a191c2..cd5fd7483ab2c6 100644 --- a/libcxx/include/__algorithm/ranges_generate_n.h +++ b/libcxx/include/__algorithm/ranges_generate_n.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -57,4 +60,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_GENERATE_N_H diff --git a/libcxx/include/__algorithm/ranges_includes.h 
b/libcxx/include/__algorithm/ranges_includes.h index aa35080c8cfd4b..0bc4c043bd1881 100644 --- a/libcxx/include/__algorithm/ranges_includes.h +++ b/libcxx/include/__algorithm/ranges_includes.h @@ -27,6 +27,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -90,4 +93,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_INCLUDES_H diff --git a/libcxx/include/__algorithm/ranges_inplace_merge.h b/libcxx/include/__algorithm/ranges_inplace_merge.h index 86001b003d5ca1..d94c0ad4656776 100644 --- a/libcxx/include/__algorithm/ranges_inplace_merge.h +++ b/libcxx/include/__algorithm/ranges_inplace_merge.h @@ -31,6 +31,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -76,4 +79,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_INPLACE_MERGE_H diff --git a/libcxx/include/__algorithm/ranges_is_heap.h b/libcxx/include/__algorithm/ranges_is_heap.h index f298c347b747a0..122368c90d924d 100644 --- a/libcxx/include/__algorithm/ranges_is_heap.h +++ b/libcxx/include/__algorithm/ranges_is_heap.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -73,4 +76,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_IS_HEAP_H diff --git a/libcxx/include/__algorithm/ranges_is_heap_until.h b/libcxx/include/__algorithm/ranges_is_heap_until.h index 73f13fb50440ec..b2705d37a6d345 100644 --- a/libcxx/include/__algorithm/ranges_is_heap_until.h +++ b/libcxx/include/__algorithm/ranges_is_heap_until.h @@ -27,6 +27,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if 
_LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -73,4 +76,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_IS_HEAP_UNTIL_H diff --git a/libcxx/include/__algorithm/ranges_is_partitioned.h b/libcxx/include/__algorithm/ranges_is_partitioned.h index 76db870efc7073..c6a585c9f51070 100644 --- a/libcxx/include/__algorithm/ranges_is_partitioned.h +++ b/libcxx/include/__algorithm/ranges_is_partitioned.h @@ -23,6 +23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -78,4 +81,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_IS_PARTITIONED_H diff --git a/libcxx/include/__algorithm/ranges_is_permutation.h b/libcxx/include/__algorithm/ranges_is_permutation.h index 2b99839bc66fa7..e0423d722b5b98 100644 --- a/libcxx/include/__algorithm/ranges_is_permutation.h +++ b/libcxx/include/__algorithm/ranges_is_permutation.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -99,4 +102,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_IS_PERMUTATION_H diff --git a/libcxx/include/__algorithm/ranges_is_sorted.h b/libcxx/include/__algorithm/ranges_is_sorted.h index 3eb2c768d66a20..d71035d5aa1d01 100644 --- a/libcxx/include/__algorithm/ranges_is_sorted.h +++ b/libcxx/include/__algorithm/ranges_is_sorted.h @@ -23,6 +23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -59,4 +62,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP__ALGORITHM_RANGES_IS_SORTED_H diff --git a/libcxx/include/__algorithm/ranges_is_sorted_until.h 
b/libcxx/include/__algorithm/ranges_is_sorted_until.h index 19e9875d2757d4..dcfb6a4e1813bd 100644 --- a/libcxx/include/__algorithm/ranges_is_sorted_until.h +++ b/libcxx/include/__algorithm/ranges_is_sorted_until.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -74,4 +77,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP__ALGORITHM_RANGES_IS_SORTED_UNTIL_H diff --git a/libcxx/include/__algorithm/ranges_iterator_concept.h b/libcxx/include/__algorithm/ranges_iterator_concept.h index 9a92030403361b..2af891d3af005a 100644 --- a/libcxx/include/__algorithm/ranges_iterator_concept.h +++ b/libcxx/include/__algorithm/ranges_iterator_concept.h @@ -18,6 +18,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -48,4 +51,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_ITERATOR_CONCEPT_H diff --git a/libcxx/include/__algorithm/ranges_lexicographical_compare.h b/libcxx/include/__algorithm/ranges_lexicographical_compare.h index 5b843dfd7b3139..90e96b5465169b 100644 --- a/libcxx/include/__algorithm/ranges_lexicographical_compare.h +++ b/libcxx/include/__algorithm/ranges_lexicographical_compare.h @@ -23,6 +23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -98,4 +101,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_LEXICOGRAPHICAL_COMPARE_H diff --git a/libcxx/include/__algorithm/ranges_lower_bound.h b/libcxx/include/__algorithm/ranges_lower_bound.h index 58b3f815b96a45..ab1f80e7ab7705 100644 --- a/libcxx/include/__algorithm/ranges_lower_bound.h +++ 
b/libcxx/include/__algorithm/ranges_lower_bound.h @@ -27,6 +27,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -65,4 +68,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_LOWER_BOUND_H diff --git a/libcxx/include/__algorithm/ranges_make_heap.h b/libcxx/include/__algorithm/ranges_make_heap.h index f17eabff43d2a1..fe9c024fbf8a83 100644 --- a/libcxx/include/__algorithm/ranges_make_heap.h +++ b/libcxx/include/__algorithm/ranges_make_heap.h @@ -32,6 +32,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -77,4 +80,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_MAKE_HEAP_H diff --git a/libcxx/include/__algorithm/ranges_max_element.h b/libcxx/include/__algorithm/ranges_max_element.h index 2ba97042f1f6e0..83adf49b61ad8f 100644 --- a/libcxx/include/__algorithm/ranges_max_element.h +++ b/libcxx/include/__algorithm/ranges_max_element.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -61,4 +64,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_MAX_ELEMENT_H diff --git a/libcxx/include/__algorithm/ranges_merge.h b/libcxx/include/__algorithm/ranges_merge.h index 7f49154ec9221f..bdf9a62d90bd24 100644 --- a/libcxx/include/__algorithm/ranges_merge.h +++ b/libcxx/include/__algorithm/ranges_merge.h @@ -27,6 +27,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -130,4 +133,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // 
_LIBCPP___ALGORITHM_RANGES_MERGE_H diff --git a/libcxx/include/__algorithm/ranges_min_element.h b/libcxx/include/__algorithm/ranges_min_element.h index 07826a0e6b817a..4b9cb76da5789c 100644 --- a/libcxx/include/__algorithm/ranges_min_element.h +++ b/libcxx/include/__algorithm/ranges_min_element.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -73,4 +76,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_MIN_ELEMENT_H diff --git a/libcxx/include/__algorithm/ranges_minmax_element.h b/libcxx/include/__algorithm/ranges_minmax_element.h index a52319f6b5d3fd..5132856ebcd5ca 100644 --- a/libcxx/include/__algorithm/ranges_minmax_element.h +++ b/libcxx/include/__algorithm/ranges_minmax_element.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -70,4 +73,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_MINMAX_H diff --git a/libcxx/include/__algorithm/ranges_mismatch.h b/libcxx/include/__algorithm/ranges_mismatch.h index db9bfc8e87db67..037af39126230a 100644 --- a/libcxx/include/__algorithm/ranges_mismatch.h +++ b/libcxx/include/__algorithm/ranges_mismatch.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -86,4 +89,6 @@ constexpr inline auto mismatch = __mismatch::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_MISMATCH_H diff --git a/libcxx/include/__algorithm/ranges_move.h b/libcxx/include/__algorithm/ranges_move.h index 8bd2409f891c05..be869f36c97304 100644 --- a/libcxx/include/__algorithm/ranges_move.h +++ b/libcxx/include/__algorithm/ranges_move.h @@ -23,6 
+23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -66,4 +69,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_MOVE_H diff --git a/libcxx/include/__algorithm/ranges_move_backward.h b/libcxx/include/__algorithm/ranges_move_backward.h index ee390a40e489a4..6d4071a33b8125 100644 --- a/libcxx/include/__algorithm/ranges_move_backward.h +++ b/libcxx/include/__algorithm/ranges_move_backward.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -68,4 +71,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_MOVE_BACKWARD_H diff --git a/libcxx/include/__algorithm/ranges_next_permutation.h b/libcxx/include/__algorithm/ranges_next_permutation.h index 9ebab3ea7c13bd..18535e0a6254a1 100644 --- a/libcxx/include/__algorithm/ranges_next_permutation.h +++ b/libcxx/include/__algorithm/ranges_next_permutation.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -70,4 +73,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_NEXT_PERMUTATION_H diff --git a/libcxx/include/__algorithm/ranges_none_of.h b/libcxx/include/__algorithm/ranges_none_of.h index b0d363895e000b..59bd87997d448f 100644 --- a/libcxx/include/__algorithm/ranges_none_of.h +++ b/libcxx/include/__algorithm/ranges_none_of.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -67,4 +70,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // 
_LIBCPP___ALGORITHM_RANGES_NONE_OF_H diff --git a/libcxx/include/__algorithm/ranges_nth_element.h b/libcxx/include/__algorithm/ranges_nth_element.h index 7abdbd0889e0cb..90ade9efe10da6 100644 --- a/libcxx/include/__algorithm/ranges_nth_element.h +++ b/libcxx/include/__algorithm/ranges_nth_element.h @@ -31,6 +31,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -76,4 +79,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_NTH_ELEMENT_H diff --git a/libcxx/include/__algorithm/ranges_partial_sort.h b/libcxx/include/__algorithm/ranges_partial_sort.h index 9ec8882097d784..c67247d2e0a77e 100644 --- a/libcxx/include/__algorithm/ranges_partial_sort.h +++ b/libcxx/include/__algorithm/ranges_partial_sort.h @@ -33,6 +33,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -74,4 +77,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_PARTIAL_SORT_H diff --git a/libcxx/include/__algorithm/ranges_partial_sort_copy.h b/libcxx/include/__algorithm/ranges_partial_sort_copy.h index eba7d9ac416576..b3bdeb78fb6f65 100644 --- a/libcxx/include/__algorithm/ranges_partial_sort_copy.h +++ b/libcxx/include/__algorithm/ranges_partial_sort_copy.h @@ -30,6 +30,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -106,4 +109,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_PARTIAL_SORT_COPY_H diff --git a/libcxx/include/__algorithm/ranges_partition.h b/libcxx/include/__algorithm/ranges_partition.h index 89d192b51fd329..a67ac4c967570f 100644 --- a/libcxx/include/__algorithm/ranges_partition.h +++ 
b/libcxx/include/__algorithm/ranges_partition.h @@ -32,6 +32,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -80,4 +83,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_PARTITION_H diff --git a/libcxx/include/__algorithm/ranges_partition_copy.h b/libcxx/include/__algorithm/ranges_partition_copy.h index 6a16b02db3e554..d60c865dd2a8a3 100644 --- a/libcxx/include/__algorithm/ranges_partition_copy.h +++ b/libcxx/include/__algorithm/ranges_partition_copy.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -102,4 +105,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_PARTITION_COPY_H diff --git a/libcxx/include/__algorithm/ranges_partition_point.h b/libcxx/include/__algorithm/ranges_partition_point.h index 6fc20e7d00e9f6..c5b11b5fed192a 100644 --- a/libcxx/include/__algorithm/ranges_partition_point.h +++ b/libcxx/include/__algorithm/ranges_partition_point.h @@ -27,6 +27,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -85,4 +88,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_PARTITION_POINT_H diff --git a/libcxx/include/__algorithm/ranges_pop_heap.h b/libcxx/include/__algorithm/ranges_pop_heap.h index 364cfe94b161e3..01f92c0f228887 100644 --- a/libcxx/include/__algorithm/ranges_pop_heap.h +++ b/libcxx/include/__algorithm/ranges_pop_heap.h @@ -32,6 +32,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -78,4 +81,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // 
_LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_POP_HEAP_H diff --git a/libcxx/include/__algorithm/ranges_prev_permutation.h b/libcxx/include/__algorithm/ranges_prev_permutation.h index ae7a68cce5fdc6..225cee9b75ec6b 100644 --- a/libcxx/include/__algorithm/ranges_prev_permutation.h +++ b/libcxx/include/__algorithm/ranges_prev_permutation.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -70,4 +73,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_PREV_PERMUTATION_H diff --git a/libcxx/include/__algorithm/ranges_push_heap.h b/libcxx/include/__algorithm/ranges_push_heap.h index 1ed9c953f54c35..9d187af38c5319 100644 --- a/libcxx/include/__algorithm/ranges_push_heap.h +++ b/libcxx/include/__algorithm/ranges_push_heap.h @@ -32,6 +32,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -77,4 +80,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_PUSH_HEAP_H diff --git a/libcxx/include/__algorithm/ranges_remove.h b/libcxx/include/__algorithm/ranges_remove.h index e27c4bdd733d81..315bed8fba775b 100644 --- a/libcxx/include/__algorithm/ranges_remove.h +++ b/libcxx/include/__algorithm/ranges_remove.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -60,4 +63,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_REMOVE_H diff --git a/libcxx/include/__algorithm/ranges_remove_copy.h b/libcxx/include/__algorithm/ranges_remove_copy.h index 5158a78e481405..84529eceac68c5 100644 --- a/libcxx/include/__algorithm/ranges_remove_copy.h +++ 
b/libcxx/include/__algorithm/ranges_remove_copy.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -73,4 +76,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_REMOVE_COPY_H diff --git a/libcxx/include/__algorithm/ranges_remove_copy_if.h b/libcxx/include/__algorithm/ranges_remove_copy_if.h index c07b4813d7d0a9..56fe017533120b 100644 --- a/libcxx/include/__algorithm/ranges_remove_copy_if.h +++ b/libcxx/include/__algorithm/ranges_remove_copy_if.h @@ -29,6 +29,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -87,4 +90,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_REMOVE_COPY_IF_H diff --git a/libcxx/include/__algorithm/ranges_remove_if.h b/libcxx/include/__algorithm/ranges_remove_if.h index 4b7aa2d2be78a8..943dbdd73807e6 100644 --- a/libcxx/include/__algorithm/ranges_remove_if.h +++ b/libcxx/include/__algorithm/ranges_remove_if.h @@ -27,6 +27,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -81,4 +84,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_REMOVE_IF_H diff --git a/libcxx/include/__algorithm/ranges_replace.h b/libcxx/include/__algorithm/ranges_replace.h index b66a41aa8d0d77..2b88dc032972f4 100644 --- a/libcxx/include/__algorithm/ranges_replace.h +++ b/libcxx/include/__algorithm/ranges_replace.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -60,4 +63,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + 
#endif // _LIBCPP___ALGORITHM_RANGES_REPLACE_H diff --git a/libcxx/include/__algorithm/ranges_replace_copy.h b/libcxx/include/__algorithm/ranges_replace_copy.h index a7627024812fd2..633f993e5c9484 100644 --- a/libcxx/include/__algorithm/ranges_replace_copy.h +++ b/libcxx/include/__algorithm/ranges_replace_copy.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -85,4 +88,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_REPLACE_COPY_H diff --git a/libcxx/include/__algorithm/ranges_replace_copy_if.h b/libcxx/include/__algorithm/ranges_replace_copy_if.h index 10ed1fda6c5c86..e065c3ac0acc90 100644 --- a/libcxx/include/__algorithm/ranges_replace_copy_if.h +++ b/libcxx/include/__algorithm/ranges_replace_copy_if.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -90,4 +93,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_REPLACE_COPY_IF_H diff --git a/libcxx/include/__algorithm/ranges_replace_if.h b/libcxx/include/__algorithm/ranges_replace_if.h index 519fa32029ac67..6445f42aea1908 100644 --- a/libcxx/include/__algorithm/ranges_replace_if.h +++ b/libcxx/include/__algorithm/ranges_replace_if.h @@ -23,6 +23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -73,4 +76,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_REPLACE_IF_H diff --git a/libcxx/include/__algorithm/ranges_reverse_copy.h b/libcxx/include/__algorithm/ranges_reverse_copy.h index 35b9edba0bfb26..60043787a71705 100644 --- a/libcxx/include/__algorithm/ranges_reverse_copy.h +++ 
b/libcxx/include/__algorithm/ranges_reverse_copy.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -62,4 +65,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_REVERSE_COPY_H diff --git a/libcxx/include/__algorithm/ranges_rotate.h b/libcxx/include/__algorithm/ranges_rotate.h index ebed9bbd542665..8d33a6f0799bf7 100644 --- a/libcxx/include/__algorithm/ranges_rotate.h +++ b/libcxx/include/__algorithm/ranges_rotate.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -63,4 +66,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_ROTATE_H diff --git a/libcxx/include/__algorithm/ranges_rotate_copy.h b/libcxx/include/__algorithm/ranges_rotate_copy.h index ab76c0944c4771..26fe110b538963 100644 --- a/libcxx/include/__algorithm/ranges_rotate_copy.h +++ b/libcxx/include/__algorithm/ranges_rotate_copy.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -60,4 +63,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_ROTATE_COPY_H diff --git a/libcxx/include/__algorithm/ranges_sample.h b/libcxx/include/__algorithm/ranges_sample.h index d347d82205a89d..e4f60a7b66be2b 100644 --- a/libcxx/include/__algorithm/ranges_sample.h +++ b/libcxx/include/__algorithm/ranges_sample.h @@ -27,6 +27,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -66,4 +69,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // 
_LIBCPP___ALGORITHM_RANGES_SAMPLE_H diff --git a/libcxx/include/__algorithm/ranges_search_n.h b/libcxx/include/__algorithm/ranges_search_n.h index 4e53f30f71f9d6..4c1d73d8e6c340 100644 --- a/libcxx/include/__algorithm/ranges_search_n.h +++ b/libcxx/include/__algorithm/ranges_search_n.h @@ -31,6 +31,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -108,4 +111,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_SEARCH_N_H diff --git a/libcxx/include/__algorithm/ranges_set_difference.h b/libcxx/include/__algorithm/ranges_set_difference.h index a9453ed336f515..0841fb4ffd0c06 100644 --- a/libcxx/include/__algorithm/ranges_set_difference.h +++ b/libcxx/include/__algorithm/ranges_set_difference.h @@ -30,6 +30,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -100,4 +103,7 @@ inline constexpr auto set_difference = __set_difference::__fn{}; _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 + +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_SET_DIFFERENCE_H diff --git a/libcxx/include/__algorithm/ranges_set_intersection.h b/libcxx/include/__algorithm/ranges_set_intersection.h index 4cdcbb75051a1f..9427379745b60f 100644 --- a/libcxx/include/__algorithm/ranges_set_intersection.h +++ b/libcxx/include/__algorithm/ranges_set_intersection.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -105,4 +108,7 @@ inline constexpr auto set_intersection = __set_intersection::__fn{}; _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 + +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_SET_INTERSECTION_H diff --git a/libcxx/include/__algorithm/ranges_set_symmetric_difference.h 
b/libcxx/include/__algorithm/ranges_set_symmetric_difference.h index d8710a1c47b0bb..995eb0999d940a 100644 --- a/libcxx/include/__algorithm/ranges_set_symmetric_difference.h +++ b/libcxx/include/__algorithm/ranges_set_symmetric_difference.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -105,4 +108,7 @@ inline constexpr auto set_symmetric_difference = __set_symmetric_difference::__f _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 + +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_SET_SYMMETRIC_DIFFERENCE_H diff --git a/libcxx/include/__algorithm/ranges_set_union.h b/libcxx/include/__algorithm/ranges_set_union.h index c627166fffed33..e870e390cc6659 100644 --- a/libcxx/include/__algorithm/ranges_set_union.h +++ b/libcxx/include/__algorithm/ranges_set_union.h @@ -31,6 +31,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -107,4 +110,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_SET_UNION_H diff --git a/libcxx/include/__algorithm/ranges_shuffle.h b/libcxx/include/__algorithm/ranges_shuffle.h index fca420058dec08..ab98ea22caabec 100644 --- a/libcxx/include/__algorithm/ranges_shuffle.h +++ b/libcxx/include/__algorithm/ranges_shuffle.h @@ -31,6 +31,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -64,4 +67,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_SHUFFLE_H diff --git a/libcxx/include/__algorithm/ranges_sort.h b/libcxx/include/__algorithm/ranges_sort.h index 2ad0e0c233be48..0296c146b3edee 100644 --- a/libcxx/include/__algorithm/ranges_sort.h +++ b/libcxx/include/__algorithm/ranges_sort.h @@ -31,6 
+31,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -76,4 +79,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_SORT_H diff --git a/libcxx/include/__algorithm/ranges_sort_heap.h b/libcxx/include/__algorithm/ranges_sort_heap.h index 365c7dba615674..bab30df1708c75 100644 --- a/libcxx/include/__algorithm/ranges_sort_heap.h +++ b/libcxx/include/__algorithm/ranges_sort_heap.h @@ -32,6 +32,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -77,4 +80,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_SORT_HEAP_H diff --git a/libcxx/include/__algorithm/ranges_stable_partition.h b/libcxx/include/__algorithm/ranges_stable_partition.h index 44937fa5899082..f34027ff772c78 100644 --- a/libcxx/include/__algorithm/ranges_stable_partition.h +++ b/libcxx/include/__algorithm/ranges_stable_partition.h @@ -34,6 +34,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -84,4 +87,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_STABLE_PARTITION_H diff --git a/libcxx/include/__algorithm/ranges_stable_sort.h b/libcxx/include/__algorithm/ranges_stable_sort.h index a4eed3836356d4..93909e253cc0f2 100644 --- a/libcxx/include/__algorithm/ranges_stable_sort.h +++ b/libcxx/include/__algorithm/ranges_stable_sort.h @@ -31,6 +31,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -74,4 +77,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // 
_LIBCPP___ALGORITHM_RANGES_STABLE_SORT_H diff --git a/libcxx/include/__algorithm/ranges_starts_with.h b/libcxx/include/__algorithm/ranges_starts_with.h index 7da78001d8148d..90e184aa9bccc2 100644 --- a/libcxx/include/__algorithm/ranges_starts_with.h +++ b/libcxx/include/__algorithm/ranges_starts_with.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 23 _LIBCPP_BEGIN_NAMESPACE_STD @@ -87,4 +90,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 23 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_STARTS_WITH_H diff --git a/libcxx/include/__algorithm/ranges_swap_ranges.h b/libcxx/include/__algorithm/ranges_swap_ranges.h index 1d0ebc0d5221e1..b6d9f618395a5e 100644 --- a/libcxx/include/__algorithm/ranges_swap_ranges.h +++ b/libcxx/include/__algorithm/ranges_swap_ranges.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -62,4 +65,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_SWAP_RANGES_H diff --git a/libcxx/include/__algorithm/ranges_transform.h b/libcxx/include/__algorithm/ranges_transform.h index f66a07ac026e5f..7850ec4f846560 100644 --- a/libcxx/include/__algorithm/ranges_transform.h +++ b/libcxx/include/__algorithm/ranges_transform.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -169,4 +172,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_TRANSFORM_H diff --git a/libcxx/include/__algorithm/ranges_unique.h b/libcxx/include/__algorithm/ranges_unique.h index b17e01fc50577e..7340310eb36a90 100644 --- a/libcxx/include/__algorithm/ranges_unique.h +++ b/libcxx/include/__algorithm/ranges_unique.h @@ -32,6 +32,9 @@ # 
pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -74,4 +77,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_UNIQUE_H diff --git a/libcxx/include/__algorithm/ranges_unique_copy.h b/libcxx/include/__algorithm/ranges_unique_copy.h index 7e89f9d97af7f6..61133885ae809d 100644 --- a/libcxx/include/__algorithm/ranges_unique_copy.h +++ b/libcxx/include/__algorithm/ranges_unique_copy.h @@ -32,6 +32,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -112,4 +115,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_UNIQUE_COPY_H diff --git a/libcxx/include/__algorithm/remove.h b/libcxx/include/__algorithm/remove.h index 2b9d4ff26ed2a5..1498852c436130 100644 --- a/libcxx/include/__algorithm/remove.h +++ b/libcxx/include/__algorithm/remove.h @@ -18,6 +18,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -38,4 +41,6 @@ remove(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_REMOVE_H diff --git a/libcxx/include/__algorithm/remove_if.h b/libcxx/include/__algorithm/remove_if.h index 6eceddce8d56b4..c77b78023f529f 100644 --- a/libcxx/include/__algorithm/remove_if.h +++ b/libcxx/include/__algorithm/remove_if.h @@ -17,6 +17,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -37,4 +40,6 @@ remove_if(_ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_REMOVE_IF_H diff --git a/libcxx/include/__algorithm/reverse.h 
b/libcxx/include/__algorithm/reverse.h index 6bd0aa39328068..4167c9116d96e7 100644 --- a/libcxx/include/__algorithm/reverse.h +++ b/libcxx/include/__algorithm/reverse.h @@ -19,6 +19,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -54,4 +57,6 @@ reverse(_BidirectionalIterator __first, _BidirectionalIterator __last) { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_REVERSE_H diff --git a/libcxx/include/__algorithm/rotate.h b/libcxx/include/__algorithm/rotate.h index d8162b1a94b272..9a4d07883e320f 100644 --- a/libcxx/include/__algorithm/rotate.h +++ b/libcxx/include/__algorithm/rotate.h @@ -23,6 +23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -190,4 +193,6 @@ rotate(_ForwardIterator __first, _ForwardIterator __middle, _ForwardIterator __l _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_ROTATE_H diff --git a/libcxx/include/__algorithm/set_difference.h b/libcxx/include/__algorithm/set_difference.h index a924702ce5f26c..f414bcecb50df1 100644 --- a/libcxx/include/__algorithm/set_difference.h +++ b/libcxx/include/__algorithm/set_difference.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -71,4 +74,6 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_d _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_SET_DIFFERENCE_H diff --git a/libcxx/include/__algorithm/set_intersection.h b/libcxx/include/__algorithm/set_intersection.h index f2603fe1365ac3..73d888d1b03843 100644 --- a/libcxx/include/__algorithm/set_intersection.h +++ b/libcxx/include/__algorithm/set_intersection.h @@ -21,6 +21,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD 
template @@ -95,4 +98,6 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_i _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_SET_INTERSECTION_H diff --git a/libcxx/include/__algorithm/set_symmetric_difference.h b/libcxx/include/__algorithm/set_symmetric_difference.h index 832c3979bfd762..db36665a61365c 100644 --- a/libcxx/include/__algorithm/set_symmetric_difference.h +++ b/libcxx/include/__algorithm/set_symmetric_difference.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -101,4 +104,6 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_symmetri _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_SET_SYMMETRIC_DIFFERENCE_H diff --git a/libcxx/include/__algorithm/set_union.h b/libcxx/include/__algorithm/set_union.h index cf48adae03bed3..a79c50fd3cf2f0 100644 --- a/libcxx/include/__algorithm/set_union.h +++ b/libcxx/include/__algorithm/set_union.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -97,4 +100,6 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_union( _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_SET_UNION_H diff --git a/libcxx/include/__algorithm/shift_left.h b/libcxx/include/__algorithm/shift_left.h index 645c58c2911924..06cd7c5f87644e 100644 --- a/libcxx/include/__algorithm/shift_left.h +++ b/libcxx/include/__algorithm/shift_left.h @@ -17,6 +17,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -51,4 +54,6 @@ shift_left(_ForwardIterator __first, _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_SHIFT_LEFT_H diff --git a/libcxx/include/__algorithm/shift_right.h 
b/libcxx/include/__algorithm/shift_right.h index 73ef98bd39deda..01853057fc4788 100644 --- a/libcxx/include/__algorithm/shift_right.h +++ b/libcxx/include/__algorithm/shift_right.h @@ -20,6 +20,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -97,4 +100,6 @@ shift_right(_ForwardIterator __first, _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_SHIFT_RIGHT_H diff --git a/libcxx/include/__algorithm/sort.h b/libcxx/include/__algorithm/sort.h index 451133a2d193dd..8a5e0211cdf4c1 100644 --- a/libcxx/include/__algorithm/sort.h +++ b/libcxx/include/__algorithm/sort.h @@ -39,6 +39,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD // stable, 2-3 compares, 0-2 swaps @@ -1009,4 +1012,6 @@ sort(_RandomAccessIterator __first, _RandomAccessIterator __last) { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_SORT_H diff --git a/libcxx/include/__algorithm/sort_heap.h b/libcxx/include/__algorithm/sort_heap.h index 0a6d992d0090e3..060fc33c3c6e93 100644 --- a/libcxx/include/__algorithm/sort_heap.h +++ b/libcxx/include/__algorithm/sort_heap.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -55,4 +58,6 @@ sort_heap(_RandomAccessIterator __first, _RandomAccessIterator __last) { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_SORT_HEAP_H diff --git a/libcxx/include/__algorithm/stable_partition.h b/libcxx/include/__algorithm/stable_partition.h index 8762abcf18e150..8bb1eaf2d22495 100644 --- a/libcxx/include/__algorithm/stable_partition.h +++ b/libcxx/include/__algorithm/stable_partition.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -299,4 +302,6 @@ 
stable_partition(_ForwardIterator __first, _ForwardIterator __last, _Predicate _ _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_STABLE_PARTITION_H diff --git a/libcxx/include/__algorithm/stable_sort.h b/libcxx/include/__algorithm/stable_sort.h index ffc6e4ce281888..9be192bd65a6ef 100644 --- a/libcxx/include/__algorithm/stable_sort.h +++ b/libcxx/include/__algorithm/stable_sort.h @@ -29,6 +29,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -265,4 +268,6 @@ inline _LIBCPP_HIDE_FROM_ABI void stable_sort(_RandomAccessIterator __first, _Ra _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_STABLE_SORT_H diff --git a/libcxx/include/__algorithm/swap_ranges.h b/libcxx/include/__algorithm/swap_ranges.h index 7fab5c49a656fe..54b453b72360e0 100644 --- a/libcxx/include/__algorithm/swap_ranges.h +++ b/libcxx/include/__algorithm/swap_ranges.h @@ -18,6 +18,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD // 2+2 iterators: the shorter size will be used. 
@@ -54,4 +57,6 @@ swap_ranges(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardItera _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_SWAP_RANGES_H diff --git a/libcxx/include/__algorithm/unique.h b/libcxx/include/__algorithm/unique.h index 1717a00c8a9346..056373d06fe44c 100644 --- a/libcxx/include/__algorithm/unique.h +++ b/libcxx/include/__algorithm/unique.h @@ -21,6 +21,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD // unique @@ -56,4 +59,6 @@ unique(_ForwardIterator __first, _ForwardIterator __last) { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_UNIQUE_H diff --git a/libcxx/include/__algorithm/unique_copy.h b/libcxx/include/__algorithm/unique_copy.h index 81fcd50f011d5d..16ce80cab32f0d 100644 --- a/libcxx/include/__algorithm/unique_copy.h +++ b/libcxx/include/__algorithm/unique_copy.h @@ -23,6 +23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD namespace __unique_copy_tags { @@ -119,4 +122,6 @@ unique_copy(_InputIterator __first, _InputIterator __last, _OutputIterator __res _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_UNIQUE_COPY_H diff --git a/libcxx/include/__algorithm/unwrap_iter.h b/libcxx/include/__algorithm/unwrap_iter.h index a298a2b271056c..50d815c9708849 100644 --- a/libcxx/include/__algorithm/unwrap_iter.h +++ b/libcxx/include/__algorithm/unwrap_iter.h @@ -80,6 +80,6 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _OrigIter __rewrap_iter(_OrigIter __orig _LIBCPP_END_NAMESPACE_STD -_LIBCPP_PUSH_MACROS +_LIBCPP_POP_MACROS #endif // _LIBCPP___ALGORITHM_UNWRAP_ITER_H diff --git a/libcxx/include/__algorithm/unwrap_range.h b/libcxx/include/__algorithm/unwrap_range.h index 053fd550b302ee..2d4b9bb5545ad3 100644 --- a/libcxx/include/__algorithm/unwrap_range.h +++ b/libcxx/include/__algorithm/unwrap_range.h @@ -22,6 +22,9 
@@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD // __unwrap_range and __rewrap_range are used to unwrap ranges which may have different iterator and sentinel types. @@ -91,4 +94,6 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Iter __rewrap_range(_Iter __orig_iter, _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_UNWRAP_RANGE_H diff --git a/libcxx/include/__availability b/libcxx/include/__availability index c5069a027750ec..b8b2da9bb12265 100644 --- a/libcxx/include/__availability +++ b/libcxx/include/__availability @@ -72,11 +72,10 @@ # endif #endif -// Availability markup is disabled when building the library, or when the compiler +// Availability markup is disabled when building the library, or when a non-Clang +// compiler is used because only Clang supports the necessary attributes. // doesn't support the proper attributes. -#if defined(_LIBCPP_BUILDING_LIBRARY) || defined(_LIBCXXABI_BUILDING_LIBRARY) || \ - !__has_feature(attribute_availability_with_strict) || !__has_feature(attribute_availability_in_templates) || \ - !__has_extension(pragma_clang_attribute_external_declaration) +#if defined(_LIBCPP_BUILDING_LIBRARY) || defined(_LIBCXXABI_BUILDING_LIBRARY) || !defined(_LIBCPP_COMPILER_CLANG_BASED) # if !defined(_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) # define _LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS # endif diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference index 9032b8f0180937..3a5339b72ddc31 100644 --- a/libcxx/include/__bit_reference +++ b/libcxx/include/__bit_reference @@ -173,7 +173,7 @@ private: // fill_n -template +template _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) { using _It = __bit_iterator<_Cp, false>; @@ -185,7 +185,7 @@ __fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) { __storage_type __clz_f = 
static_cast<__storage_type>(__bits_per_word - __first.__ctz_); __storage_type __dn = std::min(__clz_f, __n); __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - if (_FillValue) + if (_FillVal) *__first.__seg_ |= __m; else *__first.__seg_ &= ~__m; @@ -194,13 +194,13 @@ __fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) { } // do middle whole words __storage_type __nw = __n / __bits_per_word; - std::fill_n(std::__to_address(__first.__seg_), __nw, _FillValue ? static_cast<__storage_type>(-1) : 0); + std::fill_n(std::__to_address(__first.__seg_), __nw, _FillVal ? static_cast<__storage_type>(-1) : 0); __n -= __nw * __bits_per_word; // do last partial word if (__n > 0) { __first.__seg_ += __nw; __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - if (_FillValue) + if (_FillVal) *__first.__seg_ |= __m; else *__first.__seg_ &= ~__m; @@ -1007,7 +1007,7 @@ private: friend class __bit_iterator<_Cp, true>; template friend struct __bit_array; - template + template _LIBCPP_CONSTEXPR_SINCE_CXX20 friend void __fill_n(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n); template diff --git a/libcxx/include/__config b/libcxx/include/__config index 9557e8e8cf97f2..8b2eaf69d17042 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -62,7 +62,7 @@ // _LIBCPP_VERSION represents the version of libc++, which matches the version of LLVM. // Given a LLVM release LLVM XX.YY.ZZ (e.g. LLVM 17.0.1 == 17.00.01), _LIBCPP_VERSION is // defined to XXYYZZ. -# define _LIBCPP_VERSION 180000 +# define _LIBCPP_VERSION 180100 # define _LIBCPP_CONCAT_IMPL(_X, _Y) _X##_Y # define _LIBCPP_CONCAT(_X, _Y) _LIBCPP_CONCAT_IMPL(_X, _Y) @@ -174,11 +174,6 @@ // The implementation moved to the header, but we still export the symbols from // the dylib for backwards compatibility. 
# define _LIBCPP_ABI_DO_NOT_EXPORT_TO_CHARS_BASE_10 -// Save memory by providing the allocator more freedom to allocate the most -// efficient size class by dropping the alignment requirements for std::string's -// pointer from 16 to 8. This changes the output of std::string::max_size, -// which makes it ABI breaking -# define _LIBCPP_ABI_STRING_8_BYTE_ALIGNMENT # elif _LIBCPP_ABI_VERSION == 1 # if !(defined(_LIBCPP_OBJECT_FORMAT_COFF) || defined(_LIBCPP_OBJECT_FORMAT_XCOFF)) // Enable compiling copies of now inline methods into the dylib to support @@ -1280,8 +1275,8 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c # endif // _LIBCPP_ENABLE_CXX20_REMOVED_FEATURES // clang-format off -# define _LIBCPP_PUSH_MACROS _Pragma("push_macro(\"min\")") _Pragma("push_macro(\"max\")") _Pragma("push_macro(\"refresh()\")") _Pragma("push_macro(\"move(int, int)\")") _Pragma("push_macro(\"erase()\")") -# define _LIBCPP_POP_MACROS _Pragma("pop_macro(\"min\")") _Pragma("pop_macro(\"max\")") _Pragma("pop_macro(\"refresh()\")") _Pragma("pop_macro(\"move(int, int)\")") _Pragma("pop_macro(\"erase()\")") +# define _LIBCPP_PUSH_MACROS _Pragma("push_macro(\"min\")") _Pragma("push_macro(\"max\")") _Pragma("push_macro(\"refresh\")") _Pragma("push_macro(\"move\")") _Pragma("push_macro(\"erase\")") +# define _LIBCPP_POP_MACROS _Pragma("pop_macro(\"min\")") _Pragma("pop_macro(\"max\")") _Pragma("pop_macro(\"refresh\")") _Pragma("pop_macro(\"move\")") _Pragma("pop_macro(\"erase\")") // clang-format on # ifndef _LIBCPP_NO_AUTO_LINK diff --git a/libcxx/include/__filesystem/directory_iterator.h b/libcxx/include/__filesystem/directory_iterator.h index 5287a4d8b055fd..a5aa5ff5432dab 100644 --- a/libcxx/include/__filesystem/directory_iterator.h +++ b/libcxx/include/__filesystem/directory_iterator.h @@ -29,6 +29,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 17 && !defined(_LIBCPP_HAS_NO_FILESYSTEM) 
_LIBCPP_BEGIN_NAMESPACE_FILESYSTEM @@ -144,4 +147,6 @@ _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY inline constexpr bool #endif // _LIBCPP_STD_VER >= 17 && !defined(_LIBCPP_HAS_NO_FILESYSTEM) +_LIBCPP_POP_MACROS + #endif // _LIBCPP___FILESYSTEM_DIRECTORY_ITERATOR_H diff --git a/libcxx/include/__filesystem/path.h b/libcxx/include/__filesystem/path.h index 1ff992dd64e6d7..8c7d426f7a6f4f 100644 --- a/libcxx/include/__filesystem/path.h +++ b/libcxx/include/__filesystem/path.h @@ -36,6 +36,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM @@ -925,4 +928,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___FILESYSTEM_PATH_H diff --git a/libcxx/include/__filesystem/recursive_directory_iterator.h b/libcxx/include/__filesystem/recursive_directory_iterator.h index 7519cc2f2932f2..a8af4f73b14a5f 100644 --- a/libcxx/include/__filesystem/recursive_directory_iterator.h +++ b/libcxx/include/__filesystem/recursive_directory_iterator.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 17 && !defined(_LIBCPP_HAS_NO_FILESYSTEM) _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM @@ -157,4 +160,6 @@ _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY inline constexpr bool #endif // _LIBCPP_STD_VER >= 17 && !defined(_LIBCPP_HAS_NO_FILESYSTEM) +_LIBCPP_POP_MACROS + #endif // _LIBCPP___FILESYSTEM_RECURSIVE_DIRECTORY_ITERATOR_H diff --git a/libcxx/include/__format/format_arg.h b/libcxx/include/__format/format_arg.h index 10fca15d5a7a94..34ed9bcd6d63c9 100644 --- a/libcxx/include/__format/format_arg.h +++ b/libcxx/include/__format/format_arg.h @@ -30,6 +30,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -289,4 +292,6 @@ _LIBCPP_HIDE_FROM_ABI decltype(auto) visit_format_arg(_Visitor&& __vis, basic_fo 
_LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___FORMAT_FORMAT_ARG_H diff --git a/libcxx/include/__format/format_context.h b/libcxx/include/__format/format_context.h index 5b252b81f691bc..edb0348b34f363 100644 --- a/libcxx/include/__format/format_context.h +++ b/libcxx/include/__format/format_context.h @@ -35,6 +35,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -205,4 +208,6 @@ _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(basic_format_context); _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___FORMAT_FORMAT_CONTEXT_H diff --git a/libcxx/include/__format/format_functions.h b/libcxx/include/__format/format_functions.h index 015bff70f51d97..cf833ad2055441 100644 --- a/libcxx/include/__format/format_functions.h +++ b/libcxx/include/__format/format_functions.h @@ -48,6 +48,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -674,4 +677,6 @@ formatted_size(locale __loc, wformat_string<_Args...> __fmt, _Args&&... __args) _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___FORMAT_FORMAT_FUNCTIONS diff --git a/libcxx/include/__format/formatter_floating_point.h b/libcxx/include/__format/formatter_floating_point.h index 6802a8b7bd4ca3..46a090a787ae28 100644 --- a/libcxx/include/__format/formatter_floating_point.h +++ b/libcxx/include/__format/formatter_floating_point.h @@ -689,7 +689,7 @@ __format_floating_point(_Tp __value, _FormatContext& __ctx, __format_spec::__par // Let P equal the precision if nonzero, 6 if the precision is not // specified, or 1 if the precision is 0. Then, if a conversion with // style E would have an exponent of X: - int __p = std::max(1, (__specs.__has_precision() ? __specs.__precision_ : 6)); + int __p = std::max(1, (__specs.__has_precision() ? 
__specs.__precision_ : 6)); if (__result.__exponent == __result.__last) // if P > X >= -4, the conversion is with style f or F and precision P - 1 - X. // By including the radix point it calculates P - (1 + X) diff --git a/libcxx/include/__format/formatter_output.h b/libcxx/include/__format/formatter_output.h index eebe880d69ef59..d5038eb158b0ad 100644 --- a/libcxx/include/__format/formatter_output.h +++ b/libcxx/include/__format/formatter_output.h @@ -35,6 +35,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -325,4 +328,6 @@ _LIBCPP_HIDE_FROM_ABI int __truncate(basic_string_view<_CharT>& __str, int __pre _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___FORMAT_FORMATTER_OUTPUT_H diff --git a/libcxx/include/__format/write_escaped.h b/libcxx/include/__format/write_escaped.h index ec1283a173e94c..43a074dd8d7002 100644 --- a/libcxx/include/__format/write_escaped.h +++ b/libcxx/include/__format/write_escaped.h @@ -30,6 +30,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD namespace __formatter { @@ -218,4 +221,6 @@ __format_escaped_string(basic_string_view<_CharT> __values, _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___FORMAT_WRITE_ESCAPED_H diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h index 6505bb5871739d..416c26a0c73f2e 100644 --- a/libcxx/include/__functional/function.h +++ b/libcxx/include/__functional/function.h @@ -45,6 +45,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #ifndef _LIBCPP_CXX03_LANG _LIBCPP_BEGIN_NAMESPACE_STD @@ -1032,4 +1035,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_CXX03_LANG +_LIBCPP_POP_MACROS + #endif // _LIBCPP___FUNCTIONAL_FUNCTION_H diff --git a/libcxx/include/__iterator/cpp17_iterator_concepts.h 
b/libcxx/include/__iterator/cpp17_iterator_concepts.h index c4f49fe7422710..d1ad2b4e284808 100644 --- a/libcxx/include/__iterator/cpp17_iterator_concepts.h +++ b/libcxx/include/__iterator/cpp17_iterator_concepts.h @@ -29,6 +29,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -182,4 +185,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ITERATOR_CPP17_ITERATOR_CONCEPTS_H diff --git a/libcxx/include/__iterator/iterator_with_data.h b/libcxx/include/__iterator/iterator_with_data.h index 06c2fa699c30eb..afdc0a4e12e21c 100644 --- a/libcxx/include/__iterator/iterator_with_data.h +++ b/libcxx/include/__iterator/iterator_with_data.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -97,4 +100,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ITERATOR_ITERATOR_WITH_DATA_H diff --git a/libcxx/include/__memory/ranges_uninitialized_algorithms.h b/libcxx/include/__memory/ranges_uninitialized_algorithms.h index d836d00820a658..90090055bbbbf9 100644 --- a/libcxx/include/__memory/ranges_uninitialized_algorithms.h +++ b/libcxx/include/__memory/ranges_uninitialized_algorithms.h @@ -31,6 +31,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -317,4 +320,6 @@ inline constexpr auto uninitialized_move_n = __uninitialized_move_n::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___MEMORY_RANGES_UNINITIALIZED_ALGORITHMS_H diff --git a/libcxx/include/__memory/raw_storage_iterator.h b/libcxx/include/__memory/raw_storage_iterator.h index 33790a397c84b6..774878aa1c5e81 100644 --- a/libcxx/include/__memory/raw_storage_iterator.h +++ 
b/libcxx/include/__memory/raw_storage_iterator.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_RAW_STORAGE_ITERATOR) @@ -79,4 +82,6 @@ class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX17 raw_storage_iterator _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___MEMORY_RAW_STORAGE_ITERATOR_H diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h index 9a73d439306d9e..e6de615d76fa7d 100644 --- a/libcxx/include/__memory/shared_ptr.h +++ b/libcxx/include/__memory/shared_ptr.h @@ -61,6 +61,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD // NOTE: Relaxed and acq/rel atomics (for increment and decrement respectively) @@ -1662,4 +1665,6 @@ inline _LIBCPP_HIDE_FROM_ABI bool atomic_compare_exchange_weak_explicit( _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___MEMORY_SHARED_PTR_H diff --git a/libcxx/include/__memory/uninitialized_algorithms.h b/libcxx/include/__memory/uninitialized_algorithms.h index 2a4ecf655be287..9aff93a8964863 100644 --- a/libcxx/include/__memory/uninitialized_algorithms.h +++ b/libcxx/include/__memory/uninitialized_algorithms.h @@ -42,6 +42,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD struct __always_false { @@ -648,4 +651,6 @@ __uninitialized_allocator_move_if_noexcept(_Alloc&, _Iter1 __first1, _Iter1 __la _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___MEMORY_UNINITIALIZED_ALGORITHMS_H diff --git a/libcxx/include/__mutex/once_flag.h b/libcxx/include/__mutex/once_flag.h index 5a6f8e09055f75..9d7baecbc70859 100644 --- a/libcxx/include/__mutex/once_flag.h +++ b/libcxx/include/__mutex/once_flag.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include 
<__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD struct _LIBCPP_TEMPLATE_VIS once_flag; @@ -151,4 +154,6 @@ inline _LIBCPP_HIDE_FROM_ABI void call_once(once_flag& __flag, const _Callable& _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___MUTEX_ONCE_FLAG_H diff --git a/libcxx/include/__numeric/pstl_reduce.h b/libcxx/include/__numeric/pstl_reduce.h index b19972a46db7fa..f9f666c2bb38b8 100644 --- a/libcxx/include/__numeric/pstl_reduce.h +++ b/libcxx/include/__numeric/pstl_reduce.h @@ -20,6 +20,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -101,4 +104,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___NUMERIC_PSTL_REDUCE_H diff --git a/libcxx/include/__numeric/pstl_transform_reduce.h b/libcxx/include/__numeric/pstl_transform_reduce.h index 1127726046665c..2f412d41f7f27a 100644 --- a/libcxx/include/__numeric/pstl_transform_reduce.h +++ b/libcxx/include/__numeric/pstl_transform_reduce.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -148,4 +151,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___NUMERIC_PSTL_TRANSFORM_REDUCE_H diff --git a/libcxx/include/__numeric/reduce.h b/libcxx/include/__numeric/reduce.h index 1aeefce132b2b6..6c205bf581fb95 100644 --- a/libcxx/include/__numeric/reduce.h +++ b/libcxx/include/__numeric/reduce.h @@ -19,6 +19,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 17 @@ -45,4 +48,6 @@ reduce(_InputIterator __first, _InputIterator __last) { 
_LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___NUMERIC_REDUCE_H diff --git a/libcxx/include/__numeric/saturation_arithmetic.h b/libcxx/include/__numeric/saturation_arithmetic.h index 50274c6bbd9f3a..0e6f455cf22825 100644 --- a/libcxx/include/__numeric/saturation_arithmetic.h +++ b/libcxx/include/__numeric/saturation_arithmetic.h @@ -19,6 +19,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 26 @@ -107,4 +110,6 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Rp saturate_cast(_Tp __x) noexcept { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___NUMERIC_SATURATION_ARITHMETIC_H diff --git a/libcxx/include/__numeric/transform_reduce.h b/libcxx/include/__numeric/transform_reduce.h index 6c0a81e5e4b099..f1150510f0c36f 100644 --- a/libcxx/include/__numeric/transform_reduce.h +++ b/libcxx/include/__numeric/transform_reduce.h @@ -18,6 +18,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 17 @@ -51,4 +54,6 @@ transform_reduce(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterat _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___NUMERIC_TRANSFORM_REDUCE_H diff --git a/libcxx/include/__ranges/counted.h b/libcxx/include/__ranges/counted.h index 337634895766ba..83d76f8fd21068 100644 --- a/libcxx/include/__ranges/counted.h +++ b/libcxx/include/__ranges/counted.h @@ -29,6 +29,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -82,4 +85,6 @@ inline constexpr auto counted = __counted::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_COUNTED_H diff --git a/libcxx/include/__ranges/drop_while_view.h b/libcxx/include/__ranges/drop_while_view.h index 4e3ef61678f4d7..92f48bd0ecfba3 100644 --- 
a/libcxx/include/__ranges/drop_while_view.h +++ b/libcxx/include/__ranges/drop_while_view.h @@ -37,6 +37,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -128,4 +131,6 @@ inline constexpr auto drop_while = __drop_while::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_DROP_WHILE_VIEW_H diff --git a/libcxx/include/__ranges/elements_view.h b/libcxx/include/__ranges/elements_view.h index 325e4c9dca6399..989d36fbcaaae5 100644 --- a/libcxx/include/__ranges/elements_view.h +++ b/libcxx/include/__ranges/elements_view.h @@ -43,6 +43,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -410,4 +413,6 @@ inline constexpr auto values = elements<1>; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_ELEMENTS_VIEW_H diff --git a/libcxx/include/__ranges/filter_view.h b/libcxx/include/__ranges/filter_view.h index 6e6719c14470da..5b938dd4c16e19 100644 --- a/libcxx/include/__ranges/filter_view.h +++ b/libcxx/include/__ranges/filter_view.h @@ -44,6 +44,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -252,4 +255,6 @@ inline constexpr auto filter = __filter::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_FILTER_VIEW_H diff --git a/libcxx/include/__ranges/iota_view.h b/libcxx/include/__ranges/iota_view.h index c6c9618cfe6c1d..c8314dd848b447 100644 --- a/libcxx/include/__ranges/iota_view.h +++ b/libcxx/include/__ranges/iota_view.h @@ -41,6 +41,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -395,4 +398,6 @@ inline constexpr auto iota = __iota::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // 
_LIBCPP___RANGES_IOTA_VIEW_H diff --git a/libcxx/include/__ranges/join_view.h b/libcxx/include/__ranges/join_view.h index 22473059133f70..9c2c77995539bd 100644 --- a/libcxx/include/__ranges/join_view.h +++ b/libcxx/include/__ranges/join_view.h @@ -41,6 +41,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -415,4 +418,6 @@ struct __segmented_iterator_traits<_JoinViewIterator> { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_JOIN_VIEW_H diff --git a/libcxx/include/__ranges/lazy_split_view.h b/libcxx/include/__ranges/lazy_split_view.h index e96398b14b58aa..6aedfdabffe3a8 100644 --- a/libcxx/include/__ranges/lazy_split_view.h +++ b/libcxx/include/__ranges/lazy_split_view.h @@ -47,6 +47,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -433,4 +436,6 @@ inline constexpr auto lazy_split = __lazy_split_view::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_LAZY_SPLIT_VIEW_H diff --git a/libcxx/include/__ranges/repeat_view.h b/libcxx/include/__ranges/repeat_view.h index d9759abe1cba6b..d08f0e0d4e9f74 100644 --- a/libcxx/include/__ranges/repeat_view.h +++ b/libcxx/include/__ranges/repeat_view.h @@ -34,6 +34,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 23 @@ -257,4 +260,6 @@ inline constexpr bool __is_repeat_specialization> = tru _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_REPEAT_VIEW_H diff --git a/libcxx/include/__ranges/reverse_view.h b/libcxx/include/__ranges/reverse_view.h index f7846259810c92..ddbe8908414f9b 100644 --- a/libcxx/include/__ranges/reverse_view.h +++ b/libcxx/include/__ranges/reverse_view.h @@ -33,6 +33,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include 
<__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -196,4 +199,6 @@ inline constexpr auto reverse = __reverse::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_REVERSE_VIEW_H diff --git a/libcxx/include/__ranges/single_view.h b/libcxx/include/__ranges/single_view.h index ead597a9be170d..f91c7c35263676 100644 --- a/libcxx/include/__ranges/single_view.h +++ b/libcxx/include/__ranges/single_view.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -101,4 +104,6 @@ inline constexpr auto single = __single_view::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_SINGLE_VIEW_H diff --git a/libcxx/include/__ranges/split_view.h b/libcxx/include/__ranges/split_view.h index 7f03be3c346a42..98f17be04f628f 100644 --- a/libcxx/include/__ranges/split_view.h +++ b/libcxx/include/__ranges/split_view.h @@ -36,6 +36,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -224,4 +227,6 @@ inline constexpr auto split = __split_view::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_SPLIT_VIEW_H diff --git a/libcxx/include/__ranges/take_while_view.h b/libcxx/include/__ranges/take_while_view.h index 46cfe4f70ac834..77ea9f7bb81316 100644 --- a/libcxx/include/__ranges/take_while_view.h +++ b/libcxx/include/__ranges/take_while_view.h @@ -35,6 +35,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -162,4 +165,6 @@ inline constexpr auto take_while = __take_while::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_TAKE_WHILE_VIEW_H diff --git a/libcxx/include/__ranges/transform_view.h b/libcxx/include/__ranges/transform_view.h index 
3c8d825789cbc9..dc3aaa59ed8c3f 100644 --- a/libcxx/include/__ranges/transform_view.h +++ b/libcxx/include/__ranges/transform_view.h @@ -47,6 +47,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -416,4 +419,6 @@ inline constexpr auto transform = __transform::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_TRANSFORM_VIEW_H diff --git a/libcxx/include/__thread/jthread.h b/libcxx/include/__thread/jthread.h index fc86b13afb1343..2fbc8a36755e96 100644 --- a/libcxx/include/__thread/jthread.h +++ b/libcxx/include/__thread/jthread.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 && !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_STOP_TOKEN) _LIBCPP_BEGIN_NAMESPACE_STD @@ -127,4 +130,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 && !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_STOP_TOKEN) +_LIBCPP_POP_MACROS + #endif // _LIBCPP___THREAD_JTHREAD_H diff --git a/libcxx/include/__thread/thread.h b/libcxx/include/__thread/thread.h index 463bbd6772552c..0ecaac1b011bee 100644 --- a/libcxx/include/__thread/thread.h +++ b/libcxx/include/__thread/thread.h @@ -32,6 +32,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -251,4 +254,6 @@ inline _LIBCPP_HIDE_FROM_ABI void swap(thread& __x, thread& __y) _NOEXCEPT { __x _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___THREAD_THREAD_H diff --git a/libcxx/include/array b/libcxx/include/array index dcb419f536dc50..41f016a4859a32 100644 --- a/libcxx/include/array +++ b/libcxx/include/array @@ -159,6 +159,9 @@ template const T&& get(const array&&) noexce # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -493,6 +496,8 @@ to_array(_Tp (&&__arr)[_Size]) 
noexcept(is_nothrow_move_constructible_v<_Tp>) { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/condition_variable b/libcxx/include/condition_variable index e375c986e7f12e..6aac3c13ef4a74 100644 --- a/libcxx/include/condition_variable +++ b/libcxx/include/condition_variable @@ -139,6 +139,9 @@ public: # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #ifndef _LIBCPP_HAS_NO_THREADS _LIBCPP_BEGIN_NAMESPACE_STD @@ -348,6 +351,8 @@ _LIBCPP_END_NAMESPACE_STD #endif // !_LIBCPP_HAS_NO_THREADS +_LIBCPP_POP_MACROS + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/csetjmp b/libcxx/include/csetjmp index d219c8e6cb2250..9012cad22ebe74 100644 --- a/libcxx/include/csetjmp +++ b/libcxx/include/csetjmp @@ -33,7 +33,13 @@ void longjmp(jmp_buf env, int val); #include <__assert> // all public C++ headers provide the assertion handler #include <__config> -#include +// is not provided by libc++ +#if __has_include() +# include +# ifdef _LIBCPP_SETJMP_H +# error "If libc++ starts defining , the __has_include check should move to libc++'s " +# endif +#endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/experimental/iterator b/libcxx/include/experimental/iterator index 5bb1dd1ada6380..e9c1fb6924eced 100644 --- a/libcxx/include/experimental/iterator +++ b/libcxx/include/experimental/iterator @@ -64,6 +64,9 @@ namespace std { # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 14 _LIBCPP_BEGIN_NAMESPACE_LFTS @@ -115,6 +118,8 @@ _LIBCPP_END_NAMESPACE_LFTS #endif // _LIBCPP_STD_VER >= 14 +_LIBCPP_POP_MACROS + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/future 
b/libcxx/include/future index 5602ae41c14235..4eeb401c9bbcda 100644 --- a/libcxx/include/future +++ b/libcxx/include/future @@ -401,6 +401,9 @@ template struct uses_allocator, Alloc>; # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD // enum class future_errc @@ -2044,6 +2047,8 @@ inline shared_future future::share() _NOEXCEPT { return shared_futur _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17 # include #endif diff --git a/libcxx/include/ios b/libcxx/include/ios index d36f5fb2ca2842..8465860d08dc14 100644 --- a/libcxx/include/ios +++ b/libcxx/include/ios @@ -242,6 +242,9 @@ storage-class-specifier const error_category& iostream_category() noexcept; # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD typedef ptrdiff_t streamsize; @@ -820,6 +823,8 @@ _LIBCPP_HIDE_FROM_ABI inline ios_base& defaultfloat(ios_base& __str) { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/map b/libcxx/include/map index f122f2ebb15b52..2edbc0cf6245fb 100644 --- a/libcxx/include/map +++ b/libcxx/include/map @@ -617,6 +617,9 @@ erase_if(multimap& c, Predicate pred); // C++20 # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template # include diff --git a/libcxx/include/ostream b/libcxx/include/ostream index e2b2c0cbaaf254..180adda201d830 100644 --- a/libcxx/include/ostream +++ b/libcxx/include/ostream @@ -199,6 +199,9 @@ void vprint_nonunicode(ostream& os, string_view fmt, format_args args); # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -1169,6 +1172,8 @@ println(ostream& __os, format_string<_Args...> __fmt, _Args&&... 
__args) { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/print b/libcxx/include/print index 7f2b5bac3dcf61..543a540ee4f27d 100644 --- a/libcxx/include/print +++ b/libcxx/include/print @@ -32,6 +32,7 @@ namespace std { */ #include <__assert> // all public C++ headers provide the assertion handler +#include <__availability> #include <__concepts/same_as.h> #include <__config> #include <__system_error/system_error.h> @@ -43,10 +44,6 @@ namespace std { #include #include -#if __has_include() -# include -#endif - #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif @@ -68,7 +65,8 @@ _LIBCPP_EXPORTED_FROM_ABI bool __is_windows_terminal(FILE* __stream); // Note the function is only implemented on the Windows platform. _LIBCPP_EXPORTED_FROM_ABI void __write_to_windows_console(FILE* __stream, wstring_view __view); # endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS - +#elif __has_include() +_LIBCPP_EXPORTED_FROM_ABI bool __is_posix_terminal(FILE* __stream); #endif // _LIBCPP_WIN32API #if _LIBCPP_STD_VER >= 23 @@ -195,15 +193,17 @@ inline constexpr bool __use_unicode_execution_charset = _MSVC_EXECUTION_CHARACTE inline constexpr bool __use_unicode_execution_charset = true; # endif -_LIBCPP_HIDE_FROM_ABI inline bool __is_terminal(FILE* __stream) { +_LIBCPP_HIDE_FROM_ABI inline bool __is_terminal([[maybe_unused]] FILE* __stream) { // The macro _LIBCPP_TESTING_PRINT_IS_TERMINAL is used to change // the behavior in the test. This is not part of the public API. 
# ifdef _LIBCPP_TESTING_PRINT_IS_TERMINAL return _LIBCPP_TESTING_PRINT_IS_TERMINAL(__stream); +# elif _LIBCPP_AVAILABILITY_HAS_PRINT == 0 + return false; # elif defined(_LIBCPP_WIN32API) return std::__is_windows_terminal(__stream); # elif __has_include() - return isatty(fileno(__stream)); + return std::__is_posix_terminal(__stream); # else # error "Provide a way to determine whether a FILE* is a terminal" # endif diff --git a/libcxx/include/queue b/libcxx/include/queue index 692e38bb35229f..76ef85945662c4 100644 --- a/libcxx/include/queue +++ b/libcxx/include/queue @@ -283,6 +283,9 @@ template # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template > @@ -971,6 +974,8 @@ struct _LIBCPP_TEMPLATE_VIS uses_allocator # include diff --git a/libcxx/include/set b/libcxx/include/set index 55ba8f8208be1b..7f8245f8b605ab 100644 --- a/libcxx/include/set +++ b/libcxx/include/set @@ -552,6 +552,9 @@ erase_if(multiset& c, Predicate pred); // C++20 # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -1488,6 +1491,8 @@ using multiset _LIBCPP_AVAILABILITY_PMR = std::multiset<_KeyT, _CompareT, polymo _LIBCPP_END_NAMESPACE_STD #endif +_LIBCPP_POP_MACROS + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/stack b/libcxx/include/stack index 546380b0aacd07..f1f6ee8482fd21 100644 --- a/libcxx/include/stack +++ b/libcxx/include/stack @@ -138,6 +138,9 @@ template # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template > @@ -366,6 +369,8 @@ struct _LIBCPP_TEMPLATE_VIS uses_allocator, _Alloc> : pub _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/stddef.h b/libcxx/include/stddef.h index 
887776b150e49d..1583e78e3739ba 100644 --- a/libcxx/include/stddef.h +++ b/libcxx/include/stddef.h @@ -7,18 +7,6 @@ // //===----------------------------------------------------------------------===// -#if defined(__need_ptrdiff_t) || defined(__need_size_t) || defined(__need_wchar_t) || defined(__need_NULL) || \ - defined(__need_wint_t) - -# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -# endif - -# include_next - -#elif !defined(_LIBCPP_STDDEF_H) -# define _LIBCPP_STDDEF_H - /* stddef.h synopsis @@ -36,15 +24,18 @@ */ -# include <__config> +#include <__config> -# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -# endif +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif -# if __has_include_next() -# include_next -# endif +// Note: This include is outside of header guards because we sometimes get included multiple times +// with different defines and the underlying <stddef.h> will know how to deal with that.
+#include_next + +#ifndef _LIBCPP_STDDEF_H +# define _LIBCPP_STDDEF_H # ifdef __cplusplus typedef decltype(nullptr) nullptr_t; diff --git a/libcxx/include/string b/libcxx/include/string index e97139206d4fa7..ba169c3dbfc9e6 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -1937,12 +1937,7 @@ private: return (__s + (__a - 1)) & ~(__a - 1); } enum { - __alignment = -#ifdef _LIBCPP_ABI_STRING_8_BYTE_ALIGNMENT - 8 -#else - 16 -#endif + __alignment = 8 }; static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type __recommend(size_type __s) _NOEXCEPT { if (__s < __min_cap) { diff --git a/libcxx/include/strstream b/libcxx/include/strstream index 7843184e4da4f8..e20c86baa6dfc5 100644 --- a/libcxx/include/strstream +++ b/libcxx/include/strstream @@ -139,6 +139,9 @@ private: # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD class _LIBCPP_DEPRECATED _LIBCPP_EXPORTED_FROM_ABI strstreambuf : public streambuf { @@ -340,4 +343,6 @@ private: _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP_STRSTREAM diff --git a/libcxx/include/unordered_map b/libcxx/include/unordered_map index 4be25fc1cdd8fe..2c1782dc879e65 100644 --- a/libcxx/include/unordered_map +++ b/libcxx/include/unordered_map @@ -625,6 +625,9 @@ template # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template # include diff --git a/libcxx/include/unordered_set b/libcxx/include/unordered_set index 6414885f4c514a..50b616907f0052 100644 --- a/libcxx/include/unordered_set +++ b/libcxx/include/unordered_set @@ -570,6 +570,9 @@ template # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -1810,6 +1813,8 @@ using unordered_multiset _LIBCPP_AVAILABILITY_PMR = _LIBCPP_END_NAMESPACE_STD #endif +_LIBCPP_POP_MACROS + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # 
include # include diff --git a/libcxx/include/version b/libcxx/include/version index 9e26da8c1b2425..d356976d6454ad 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -266,7 +266,9 @@ __cpp_lib_within_lifetime 202306L # define __cpp_lib_make_reverse_iterator 201402L # define __cpp_lib_make_unique 201304L # define __cpp_lib_null_iterators 201304L -# define __cpp_lib_quoted_string_io 201304L +# if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# define __cpp_lib_quoted_string_io 201304L +# endif # define __cpp_lib_result_of_sfinae 201210L # define __cpp_lib_robust_nonmodifying_seq_ops 201304L # if !defined(_LIBCPP_HAS_NO_THREADS) @@ -294,7 +296,7 @@ __cpp_lib_within_lifetime 202306L # define __cpp_lib_clamp 201603L # define __cpp_lib_enable_shared_from_this 201603L // # define __cpp_lib_execution 201603L -# if _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY +# if !defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY # define __cpp_lib_filesystem 201703L # endif # define __cpp_lib_gcd_lcm 201606L @@ -323,7 +325,9 @@ __cpp_lib_within_lifetime 202306L // # define __cpp_lib_parallel_algorithm 201603L # define __cpp_lib_raw_memory_algorithms 201606L # define __cpp_lib_sample 201603L -# define __cpp_lib_scoped_lock 201703L +# if !defined(_LIBCPP_HAS_NO_THREADS) +# define __cpp_lib_scoped_lock 201703L +# endif # if !defined(_LIBCPP_HAS_NO_THREADS) # define __cpp_lib_shared_mutex 201505L # endif @@ -496,7 +500,9 @@ __cpp_lib_within_lifetime 202306L // # define __cpp_lib_freestanding_optional 202311L // # define __cpp_lib_freestanding_string_view 202311L // # define __cpp_lib_freestanding_variant 202311L -# define __cpp_lib_fstream_native_handle 202306L +# if !defined(_LIBCPP_HAS_NO_FILESYSTEM) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# define __cpp_lib_fstream_native_handle 202306L +# endif // # define __cpp_lib_function_ref 202306L // # define __cpp_lib_hazard_pointer 202306L // # define __cpp_lib_linalg 202311L diff --git 
a/libcxx/lib/abi/CHANGELOG.TXT b/libcxx/lib/abi/CHANGELOG.TXT index 1179c253f18c8f..7ff604959f4d5c 100644 --- a/libcxx/lib/abi/CHANGELOG.TXT +++ b/libcxx/lib/abi/CHANGELOG.TXT @@ -16,6 +16,14 @@ New entries should be added directly below the "Version" header. Version 18.0 ------------ +* [libc++] Moves is_terminal to the dylib + + The patch moves the POSIX implementation of is_terminal to the dylib. This is + needed to avoid using <unistd.h> in public headers. + + All platforms + Symbol added: _ZNSt6__ndk119__is_posix_terminalEP7__sFILE + * [libc++abi] Implement __cxa_init_primary_exception and use it to optimize std::make_exception_ptr (#65534) This patch implements __cxa_init_primary_exception, an extension to the Itanium C++ ABI. diff --git a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist index c2fea4d8adb420..2064f45bf8c084 100644 --- a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1495,6 +1495,7 @@ {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '__ZNSt3__119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist index a60f099b532052..fec3a4505a0c6d 100644
--- a/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1176,6 +1176,7 @@ {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt6__ndk119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist index a159ff52218667..e52cf98dd4c4f1 100644 --- a/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -534,6 +534,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP4FILE', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 
'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist index 5749a7520f9bac..52a04706ddf20b 100644 --- a/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -534,6 +534,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP4FILE', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist index e827114f169197..bced6b2ea81ba5 100644 --- a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist +++ 
b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1495,6 +1495,7 @@ {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '__ZNSt3__119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist index f4077adc074e0a..efa2189e9c9287 100644 --- a/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1176,6 +1176,7 @@ {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt6__ndk119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist 
index e3d3fcb35d8403..ebda5b0dfba57d 100644 --- a/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1190,6 +1190,7 @@ {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist index 16923301d2548e..6432ad3be35859 100644 --- a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1188,6 +1188,7 @@ {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP8_IO_FILE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git 
a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist index 2380ffb100de97..1fe84e17b3f7f0 100644 --- a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist @@ -1159,6 +1159,7 @@ {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP8_IO_FILE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/modules/CMakeLists.txt b/libcxx/modules/CMakeLists.txt index 0388c048dacb8b..d47d19a4755317 100644 --- a/libcxx/modules/CMakeLists.txt +++ b/libcxx/modules/CMakeLists.txt @@ -137,6 +137,25 @@ set(LIBCXX_MODULE_STD_COMPAT_SOURCES std.compat/cwctype.inc ) +# TODO MODULES the CMakeLists.txt in the build directory is only temporary. +# This allows using as available in the build directory. Once build systems +# have proper support for the installed files this will be removed. +if ("${LIBCXX_GENERATED_INCLUDE_DIR}" STREQUAL "${LIBCXX_GENERATED_INCLUDE_TARGET_DIR}") + # This typically happens when the target is not installed. + set(LIBCXX_CONFIGURED_INCLUDE_DIRS "${LIBCXX_GENERATED_INCLUDE_DIR}") +else() + # It's important that the arch directory be included first so that its header files + # which interpose on the default include dir be included instead of the default ones. 
+ set(LIBCXX_CONFIGURED_INCLUDE_DIRS + "${LIBCXX_GENERATED_INCLUDE_TARGET_DIR};${LIBCXX_GENERATED_INCLUDE_DIR}" + ) +endif() +configure_file( + "CMakeLists.txt.in" + "${LIBCXX_GENERATED_MODULE_DIR}/CMakeLists.txt" + @ONLY +) + set(LIBCXX_MODULE_STD_INCLUDE_SOURCES) foreach(file ${LIBCXX_MODULE_STD_SOURCES}) set( @@ -166,6 +185,7 @@ configure_file( ) set(_all_modules) +list(APPEND _all_modules "${LIBCXX_GENERATED_MODULE_DIR}/CMakeLists.txt") list(APPEND _all_modules "${LIBCXX_GENERATED_MODULE_DIR}/std.cppm") list(APPEND _all_modules "${LIBCXX_GENERATED_MODULE_DIR}/std.compat.cppm") foreach(file ${LIBCXX_MODULE_STD_SOURCES} ${LIBCXX_MODULE_STD_COMPAT_SOURCES}) @@ -186,9 +206,20 @@ add_custom_target(generate-cxx-modules # Configure the modules manifest. # Use the relative path between the installation and the module in the json # file. This allows moving the entire installation to a different location. +if("${CMAKE_INSTALL_PREFIX}" STREQUAL "") + set(BASE_DIRECTORY "/") +else() + set(BASE_DIRECTORY ${CMAKE_INSTALL_PREFIX}) +endif() +cmake_path(ABSOLUTE_PATH LIBCXX_INSTALL_LIBRARY_DIR + BASE_DIRECTORY ${BASE_DIRECTORY} + OUTPUT_VARIABLE ABS_LIBRARY_DIR) +cmake_path(ABSOLUTE_PATH LIBCXX_INSTALL_MODULES_DIR + BASE_DIRECTORY ${BASE_DIRECTORY} + OUTPUT_VARIABLE ABS_MODULES_DIR) file(RELATIVE_PATH LIBCXX_MODULE_RELATIVE_PATH - ${CMAKE_INSTALL_PREFIX}/${LIBCXX_INSTALL_LIBRARY_DIR} - ${CMAKE_INSTALL_PREFIX}/${LIBCXX_INSTALL_MODULES_DIR}) + ${ABS_LIBRARY_DIR} + ${ABS_MODULES_DIR}) configure_file( "modules.json.in" "${LIBCXX_LIBRARY_DIR}/libc++.modules.json" diff --git a/libcxx/modules/CMakeLists.txt.in b/libcxx/modules/CMakeLists.txt.in new file mode 100644 index 00000000000000..e332d70cc16333 --- /dev/null +++ b/libcxx/modules/CMakeLists.txt.in @@ -0,0 +1,88 @@ +cmake_minimum_required(VERSION 3.26) + +project(libc++-modules LANGUAGES CXX) + +# Enable CMake's module support +if(CMAKE_VERSION VERSION_LESS "3.28.0") + if(CMAKE_VERSION VERSION_LESS "3.27.0") + 
set(CMAKE_EXPERIMENTAL_CXX_MODULE_CMAKE_API "2182bf5c-ef0d-489a-91da-49dbc3090d2a") + else() + set(CMAKE_EXPERIMENTAL_CXX_MODULE_CMAKE_API "aa1f7df0-828a-4fcd-9afc-2dc80491aca7") + endif() + set(CMAKE_EXPERIMENTAL_CXX_MODULE_DYNDEP 1) +else() + cmake_policy(VERSION 3.28) +endif() + +# Default to C++ extensions being off. Libc++'s modules support have trouble +# with extensions right now. +set(CMAKE_CXX_EXTENSIONS OFF) + +# Propagates the CMake options to the modules. +# +# This uses the std module hard-coded since the std.compat module does not +# depend on these flags. +macro(compile_define_if_not condition def) + if (NOT ${condition}) + target_compile_definitions(std PRIVATE ${def}) + endif() +endmacro() +macro(compile_define_if condition def) + if (${condition}) + target_compile_definitions(std PRIVATE ${def}) + endif() +endmacro() + +### STD + +add_library(std) +target_sources(std + PUBLIC FILE_SET cxx_modules TYPE CXX_MODULES FILES + std.cppm +) + +target_include_directories(std SYSTEM PRIVATE @LIBCXX_CONFIGURED_INCLUDE_DIRS@) + +if (NOT @LIBCXX_ENABLE_EXCEPTIONS@) + target_compile_options(std PUBLIC -fno-exceptions) +endif() + +target_compile_options(std + PUBLIC + -nostdinc++ + -Wno-reserved-module-identifier + -Wno-reserved-user-defined-literal + @LIBCXX_COMPILE_FLAGS@ +) +set_target_properties(std + PROPERTIES + OUTPUT_NAME "c++std" +) + +### STD.COMPAT + +add_library(std.compat) +target_sources(std.compat + PUBLIC FILE_SET cxx_modules TYPE CXX_MODULES FILES + std.compat.cppm +) + +target_include_directories(std.compat SYSTEM PRIVATE @LIBCXX_CONFIGURED_INCLUDE_DIRS@) + +if (NOT @LIBCXX_ENABLE_EXCEPTIONS@) + target_compile_options(std.compat PUBLIC -fno-exceptions) +endif() + +target_compile_options(std.compat + PUBLIC + -nostdinc++ + -Wno-reserved-module-identifier + -Wno-reserved-user-defined-literal + -fmodule-file=std=${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/std.dir/std.pcm + @LIBCXX_COMPILE_FLAGS@ +) +set_target_properties(std.compat + PROPERTIES + 
OUTPUT_NAME "c++std.compat" +) +add_dependencies(std.compat std) diff --git a/libcxx/modules/modules.json.in b/libcxx/modules/modules.json.in index ddc377f28f9194..759ac92d81f18e 100644 --- a/libcxx/modules/modules.json.in +++ b/libcxx/modules/modules.json.in @@ -5,7 +5,7 @@ { "logical-name": "std", "source-path": "@LIBCXX_MODULE_RELATIVE_PATH@/std.cppm", - "is-standard-library": true, + "is-std-library": true, "local-arguments": { "system-include-directories": [ "@LIBCXX_MODULE_RELATIVE_PATH@" diff --git a/libcxx/modules/std.compat/cstdlib.inc b/libcxx/modules/std.compat/cstdlib.inc index a45a0a1caf8ba9..4783cbf5162390 100644 --- a/libcxx/modules/std.compat/cstdlib.inc +++ b/libcxx/modules/std.compat/cstdlib.inc @@ -25,7 +25,7 @@ export { using ::system; // [c.malloc], C library memory allocation - using ::aligned_alloc; + using ::aligned_alloc _LIBCPP_USING_IF_EXISTS; using ::calloc; using ::free; using ::malloc; diff --git a/libcxx/modules/std/atomic.inc b/libcxx/modules/std/atomic.inc index 5139b7531093d7..88b31ccdb20840 100644 --- a/libcxx/modules/std/atomic.inc +++ b/libcxx/modules/std/atomic.inc @@ -60,7 +60,9 @@ export namespace std { using std::atomic_char; using std::atomic_char16_t; using std::atomic_char32_t; +#ifndef _LIBCPP_HAS_NO_CHAR8_T using std::atomic_char8_t; +#endif using std::atomic_int; using std::atomic_llong; using std::atomic_long; diff --git a/libcxx/modules/std/iosfwd.inc b/libcxx/modules/std/iosfwd.inc index ec8b434ca0c51b..410fb6aefed801 100644 --- a/libcxx/modules/std/iosfwd.inc +++ b/libcxx/modules/std/iosfwd.inc @@ -14,7 +14,9 @@ export namespace std { #endif using std::u16streampos; using std::u32streampos; +#ifndef _LIBCPP_HAS_NO_CHAR8_T using std::u8streampos; +#endif using std::basic_osyncstream; using std::basic_syncbuf; diff --git a/libcxx/modules/std/ostream.inc b/libcxx/modules/std/ostream.inc index 8fcbfb4bdc1828..0e0e2d54fe6bae 100644 --- a/libcxx/modules/std/ostream.inc +++ b/libcxx/modules/std/ostream.inc @@ -33,8 +33,10 
@@ export namespace std { using std::println; using std::vprint_nonunicode; +# ifndef _LIBCPP_HAS_NO_UNICODE using std::vprint_unicode; -# endif // _LIBCPP_STD_VER >= 23 +# endif // _LIBCPP_HAS_NO_UNICODE +# endif // _LIBCPP_STD_VER >= 23 #endif // _LIBCPP_HAS_NO_LOCALIZATION } // namespace std diff --git a/libcxx/modules/std/string.inc b/libcxx/modules/std/string.inc index c83ee7643f87e9..9808a96215a182 100644 --- a/libcxx/modules/std/string.inc +++ b/libcxx/modules/std/string.inc @@ -34,7 +34,9 @@ export namespace std { using std::string; using std::u16string; using std::u32string; +#ifndef _LIBCPP_HAS_NO_CHAR8_T using std::u8string; +#endif #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS using std::wstring; #endif @@ -58,7 +60,9 @@ export namespace std { using std::pmr::string; using std::pmr::u16string; using std::pmr::u32string; +#ifndef _LIBCPP_HAS_NO_CHAR8_T using std::pmr::u8string; +#endif #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS using std::pmr::wstring; #endif diff --git a/libcxx/modules/std/string_view.inc b/libcxx/modules/std/string_view.inc index 1fa63a77395358..f4f9d80ddb83da 100644 --- a/libcxx/modules/std/string_view.inc +++ b/libcxx/modules/std/string_view.inc @@ -27,7 +27,9 @@ export namespace std { using std::string_view; using std::u16string_view; using std::u32string_view; +#ifndef _LIBCPP_HAS_NO_CHAR8_T using std::u8string_view; +#endif #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS using std::wstring_view; #endif diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt index 44a088663463c9..1b80625304a412 100644 --- a/libcxx/src/CMakeLists.txt +++ b/libcxx/src/CMakeLists.txt @@ -306,7 +306,10 @@ if (LIBCXX_ENABLE_STATIC) # then its code shouldn't declare them with hidden visibility. They might # actually be provided by a shared library at link time. 
if (LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS) - append_flags_if_supported(CXX_STATIC_LIBRARY_FLAGS -fvisibility-global-new-delete-hidden) + append_flags_if_supported(CXX_STATIC_LIBRARY_FLAGS -fvisibility-global-new-delete=force-hidden) + if (NOT CXX_SUPPORTS_FVISIBILITY_GLOBAL_NEW_DELETE_EQ_FORCE_HIDDEN_FLAG) + append_flags_if_supported(CXX_STATIC_LIBRARY_FLAGS -fvisibility-global-new-delete-hidden) + endif() endif() target_compile_options(cxx_static PRIVATE ${CXX_STATIC_LIBRARY_FLAGS}) # _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS can be defined in __config_site diff --git a/libcxx/src/print.cpp b/libcxx/src/print.cpp index 3692187a5954a3..8fa59fdd097bcd 100644 --- a/libcxx/src/print.cpp +++ b/libcxx/src/print.cpp @@ -8,22 +8,26 @@ #include <__config> -#if defined(_LIBCPP_WIN32API) +#include +#include + +#include <__system_error/system_error.h> -# include -# include +#include "filesystem/error.h" +#if defined(_LIBCPP_WIN32API) # define WIN32_LEAN_AND_MEAN # define NOMINMAX # include # include - -# include <__system_error/system_error.h> - -# include "filesystem/error.h" +#elif __has_include() +# include +#endif _LIBCPP_BEGIN_NAMESPACE_STD +#if defined(_LIBCPP_WIN32API) + _LIBCPP_EXPORTED_FROM_ABI bool __is_windows_terminal(FILE* __stream) { // Note the Standard does this in one call, but it's unclear whether // an invalid handle is allowed when calling GetConsoleMode. 
@@ -52,6 +56,9 @@ __write_to_windows_console([[maybe_unused]] FILE* __stream, [[maybe_unused]] wst } # endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS -_LIBCPP_END_NAMESPACE_STD +#elif __has_include() // !_LIBCPP_WIN32API -#endif // !_LIBCPP_WIN32API +_LIBCPP_EXPORTED_FROM_ABI bool __is_posix_terminal(FILE* __stream) { return isatty(fileno(__stream)); } +#endif + +_LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/test/libcxx/strings/basic.string/string.capacity/allocation_size.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.capacity/allocation_size.pass.cpp index c7df56c815a805..1110e3d3ec568a 100644 --- a/libcxx/test/libcxx/strings/basic.string/string.capacity/allocation_size.pass.cpp +++ b/libcxx/test/libcxx/strings/basic.string/string.capacity/allocation_size.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// XFAIL: stdlib=apple-libc++ && target={{.+}}-apple-macosx{{10.13|10.15|11.0}} + // // This test demonstrates the smaller allocation sizes when the alignment @@ -17,14 +19,8 @@ #include "test_macros.h" -// alignment of the string heap buffer is hardcoded to either 16 or 8 - -const std::size_t alignment = -#ifdef _LIBCPP_ABI_STRING_8_BYTE_ALIGNMENT - 8; -#else - 16; -#endif +// alignment of the string heap buffer is hardcoded to 8 +const std::size_t alignment = 8; int main(int, char**) { std::string input_string; diff --git a/libcxx/test/libcxx/strings/basic.string/string.capacity/max_size.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.capacity/max_size.pass.cpp index a3cb79522f2e1e..726570beb6d1ae 100644 --- a/libcxx/test/libcxx/strings/basic.string/string.capacity/max_size.pass.cpp +++ b/libcxx/test/libcxx/strings/basic.string/string.capacity/max_size.pass.cpp @@ -17,14 +17,8 @@ #include "test_macros.h" -// alignment of the string heap buffer is hardcoded to 16 - -static const std::size_t alignment = -#ifdef _LIBCPP_ABI_STRING_8_BYTE_ALIGNMENT - 8; -#else - 16; -#endif +// alignment of 
the string heap buffer is hardcoded to 8 +static const std::size_t alignment = 8; template TEST_CONSTEXPR_CXX20 void full_size() { diff --git a/libcxx/test/libcxx/system_reserved_names.gen.py b/libcxx/test/libcxx/system_reserved_names.gen.py index 8ddd035ff2d258..5b75dba544ef20 100644 --- a/libcxx/test/libcxx/system_reserved_names.gen.py +++ b/libcxx/test/libcxx/system_reserved_names.gen.py @@ -7,7 +7,8 @@ #===----------------------------------------------------------------------===## # Test that headers are not tripped up by the surrounding code defining various -# alphabetic macros. +# alphabetic macros. Also ensure that we don't swallow the definition of user +# provided macros (in other words, ensure that we push/pop correctly everywhere). # RUN: %{python} %s %{libcxx}/utils @@ -162,4 +163,13 @@ #define refresh SYSTEM_RESERVED_NAME #include <{header}> + +// Make sure we don't swallow the definition of the macros we push/pop +#define STRINGIFY_IMPL(x) #x +#define STRINGIFY(x) STRINGIFY_IMPL(x) +static_assert(__builtin_strcmp(STRINGIFY(min), STRINGIFY(SYSTEM_RESERVED_NAME)) == 0, ""); +static_assert(__builtin_strcmp(STRINGIFY(max), STRINGIFY(SYSTEM_RESERVED_NAME)) == 0, ""); +static_assert(__builtin_strcmp(STRINGIFY(move), STRINGIFY(SYSTEM_RESERVED_NAME)) == 0, ""); +static_assert(__builtin_strcmp(STRINGIFY(erase), STRINGIFY(SYSTEM_RESERVED_NAME)) == 0, ""); +static_assert(__builtin_strcmp(STRINGIFY(refresh), STRINGIFY(SYSTEM_RESERVED_NAME)) == 0, ""); """) diff --git a/libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp b/libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp new file mode 100644 index 00000000000000..c55a0a4d6e5d1b --- /dev/null +++ b/libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: stdlib=apple-libc++ + +// Test that using -pedantic-errors doesn't turn off availability annotations. +// This used to be the case because we used __has_extension(...) to enable the +// availability annotations, and -pedantic-errors changes the behavior of +// __has_extension(...) in an incompatible way. + +// ADDITIONAL_COMPILE_FLAGS: -pedantic-errors + +#include <__availability> + +#if defined(_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) +# error Availability annotations should be enabled on Apple platforms in the system configuration! +#endif diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp index 46ccde800c1796..3f03e8be9aeab3 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp @@ -51,7 +51,7 @@ # error "__cpp_lib_char8_t should not be defined before c++20" # endif -# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY +# if !defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY) # ifndef __cpp_lib_filesystem # error "__cpp_lib_filesystem should be defined in c++17" # endif @@ -60,7 +60,7 @@ # endif # else # ifdef __cpp_lib_filesystem -# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY' is not met!" 
+# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY)' is not met!" # endif # endif @@ -79,7 +79,7 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY +# if !defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY) # ifndef __cpp_lib_filesystem # error "__cpp_lib_filesystem should be defined in c++20" # endif @@ -88,7 +88,7 @@ # endif # else # ifdef __cpp_lib_filesystem -# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY' is not met!" +# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY)' is not met!" # endif # endif @@ -107,7 +107,7 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY +# if !defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY) # ifndef __cpp_lib_filesystem # error "__cpp_lib_filesystem should be defined in c++23" # endif @@ -116,7 +116,7 @@ # endif # else # ifdef __cpp_lib_filesystem -# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY' is not met!" +# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY)' is not met!" 
# endif # endif @@ -135,7 +135,7 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY +# if !defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY) # ifndef __cpp_lib_filesystem # error "__cpp_lib_filesystem should be defined in c++26" # endif @@ -144,7 +144,7 @@ # endif # else # ifdef __cpp_lib_filesystem -# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY' is not met!" +# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY)' is not met!" # endif # endif diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/fstream.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/fstream.version.compile.pass.cpp index eab0313b2c1ef7..f2c31b60a92f90 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/fstream.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/fstream.version.compile.pass.cpp @@ -56,11 +56,17 @@ #elif TEST_STD_VER > 23 -# ifndef __cpp_lib_fstream_native_handle -# error "__cpp_lib_fstream_native_handle should be defined in c++26" -# endif -# if __cpp_lib_fstream_native_handle != 202306L -# error "__cpp_lib_fstream_native_handle should have the value 202306L in c++26" +# if !defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && !defined(_LIBCPP_HAS_NO_LOCALIZATION)) +# ifndef __cpp_lib_fstream_native_handle +# error "__cpp_lib_fstream_native_handle should be defined in c++26" +# endif +# if __cpp_lib_fstream_native_handle != 202306L +# error "__cpp_lib_fstream_native_handle should have the value 202306L in c++26" +# endif +# else +# ifdef __cpp_lib_fstream_native_handle 
+# error "__cpp_lib_fstream_native_handle should not be defined when the requirement '!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && !defined(_LIBCPP_HAS_NO_LOCALIZATION))' is not met!" +# endif # endif #endif // TEST_STD_VER > 23 diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/iomanip.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/iomanip.version.compile.pass.cpp index b678e6804e959c..da9970fd162787 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/iomanip.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/iomanip.version.compile.pass.cpp @@ -32,47 +32,77 @@ #elif TEST_STD_VER == 14 -# ifndef __cpp_lib_quoted_string_io -# error "__cpp_lib_quoted_string_io should be defined in c++14" -# endif -# if __cpp_lib_quoted_string_io != 201304L -# error "__cpp_lib_quoted_string_io should have the value 201304L in c++14" +# if !defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# ifndef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should be defined in c++14" +# endif +# if __cpp_lib_quoted_string_io != 201304L +# error "__cpp_lib_quoted_string_io should have the value 201304L in c++14" +# endif +# else +# ifdef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should not be defined when the requirement '!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)' is not met!" 
+# endif # endif #elif TEST_STD_VER == 17 -# ifndef __cpp_lib_quoted_string_io -# error "__cpp_lib_quoted_string_io should be defined in c++17" -# endif -# if __cpp_lib_quoted_string_io != 201304L -# error "__cpp_lib_quoted_string_io should have the value 201304L in c++17" +# if !defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# ifndef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should be defined in c++17" +# endif +# if __cpp_lib_quoted_string_io != 201304L +# error "__cpp_lib_quoted_string_io should have the value 201304L in c++17" +# endif +# else +# ifdef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should not be defined when the requirement '!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)' is not met!" +# endif # endif #elif TEST_STD_VER == 20 -# ifndef __cpp_lib_quoted_string_io -# error "__cpp_lib_quoted_string_io should be defined in c++20" -# endif -# if __cpp_lib_quoted_string_io != 201304L -# error "__cpp_lib_quoted_string_io should have the value 201304L in c++20" +# if !defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# ifndef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should be defined in c++20" +# endif +# if __cpp_lib_quoted_string_io != 201304L +# error "__cpp_lib_quoted_string_io should have the value 201304L in c++20" +# endif +# else +# ifdef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should not be defined when the requirement '!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)' is not met!" 
+# endif # endif #elif TEST_STD_VER == 23 -# ifndef __cpp_lib_quoted_string_io -# error "__cpp_lib_quoted_string_io should be defined in c++23" -# endif -# if __cpp_lib_quoted_string_io != 201304L -# error "__cpp_lib_quoted_string_io should have the value 201304L in c++23" +# if !defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# ifndef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should be defined in c++23" +# endif +# if __cpp_lib_quoted_string_io != 201304L +# error "__cpp_lib_quoted_string_io should have the value 201304L in c++23" +# endif +# else +# ifdef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should not be defined when the requirement '!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)' is not met!" +# endif # endif #elif TEST_STD_VER > 23 -# ifndef __cpp_lib_quoted_string_io -# error "__cpp_lib_quoted_string_io should be defined in c++26" -# endif -# if __cpp_lib_quoted_string_io != 201304L -# error "__cpp_lib_quoted_string_io should have the value 201304L in c++26" +# if !defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# ifndef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should be defined in c++26" +# endif +# if __cpp_lib_quoted_string_io != 201304L +# error "__cpp_lib_quoted_string_io should have the value 201304L in c++26" +# endif +# else +# ifdef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should not be defined when the requirement '!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)' is not met!" 
+# endif # endif #endif // TEST_STD_VER > 23 diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/mutex.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/mutex.version.compile.pass.cpp index e86239f534790e..0f279973dccd8a 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/mutex.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/mutex.version.compile.pass.cpp @@ -38,38 +38,62 @@ #elif TEST_STD_VER == 17 -# ifndef __cpp_lib_scoped_lock -# error "__cpp_lib_scoped_lock should be defined in c++17" -# endif -# if __cpp_lib_scoped_lock != 201703L -# error "__cpp_lib_scoped_lock should have the value 201703L in c++17" +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should be defined in c++17" +# endif +# if __cpp_lib_scoped_lock != 201703L +# error "__cpp_lib_scoped_lock should have the value 201703L in c++17" +# endif +# else +# ifdef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should not be defined when the requirement '!defined(_LIBCPP_HAS_NO_THREADS)' is not met!" +# endif # endif #elif TEST_STD_VER == 20 -# ifndef __cpp_lib_scoped_lock -# error "__cpp_lib_scoped_lock should be defined in c++20" -# endif -# if __cpp_lib_scoped_lock != 201703L -# error "__cpp_lib_scoped_lock should have the value 201703L in c++20" +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should be defined in c++20" +# endif +# if __cpp_lib_scoped_lock != 201703L +# error "__cpp_lib_scoped_lock should have the value 201703L in c++20" +# endif +# else +# ifdef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should not be defined when the requirement '!defined(_LIBCPP_HAS_NO_THREADS)' is not met!" 
+# endif # endif #elif TEST_STD_VER == 23 -# ifndef __cpp_lib_scoped_lock -# error "__cpp_lib_scoped_lock should be defined in c++23" -# endif -# if __cpp_lib_scoped_lock != 201703L -# error "__cpp_lib_scoped_lock should have the value 201703L in c++23" +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should be defined in c++23" +# endif +# if __cpp_lib_scoped_lock != 201703L +# error "__cpp_lib_scoped_lock should have the value 201703L in c++23" +# endif +# else +# ifdef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should not be defined when the requirement '!defined(_LIBCPP_HAS_NO_THREADS)' is not met!" +# endif # endif #elif TEST_STD_VER > 23 -# ifndef __cpp_lib_scoped_lock -# error "__cpp_lib_scoped_lock should be defined in c++26" -# endif -# if __cpp_lib_scoped_lock != 201703L -# error "__cpp_lib_scoped_lock should have the value 201703L in c++26" +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should be defined in c++26" +# endif +# if __cpp_lib_scoped_lock != 201703L +# error "__cpp_lib_scoped_lock should have the value 201703L in c++26" +# endif +# else +# ifdef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should not be defined when the requirement '!defined(_LIBCPP_HAS_NO_THREADS)' is not met!" 
+# endif # endif #endif // TEST_STD_VER > 23 diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index c319940fe6e49c..41eb7f560213f8 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -1571,11 +1571,17 @@ # error "__cpp_lib_print should not be defined before c++23" # endif -# ifndef __cpp_lib_quoted_string_io -# error "__cpp_lib_quoted_string_io should be defined in c++14" -# endif -# if __cpp_lib_quoted_string_io != 201304L -# error "__cpp_lib_quoted_string_io should have the value 201304L in c++14" +# if !defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# ifndef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should be defined in c++14" +# endif +# if __cpp_lib_quoted_string_io != 201304L +# error "__cpp_lib_quoted_string_io should have the value 201304L in c++14" +# endif +# else +# ifdef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should not be defined when the requirement '!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)' is not met!" 
+# endif # endif # ifdef __cpp_lib_ranges @@ -2183,7 +2189,7 @@ # error "__cpp_lib_expected should not be defined before c++23" # endif -# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY +# if !defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY) # ifndef __cpp_lib_filesystem # error "__cpp_lib_filesystem should be defined in c++17" # endif @@ -2192,7 +2198,7 @@ # endif # else # ifdef __cpp_lib_filesystem -# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY' is not met!" +# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY)' is not met!" # endif # endif @@ -2568,11 +2574,17 @@ # error "__cpp_lib_print should not be defined before c++23" # endif -# ifndef __cpp_lib_quoted_string_io -# error "__cpp_lib_quoted_string_io should be defined in c++17" -# endif -# if __cpp_lib_quoted_string_io != 201304L -# error "__cpp_lib_quoted_string_io should have the value 201304L in c++17" +# if !defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# ifndef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should be defined in c++17" +# endif +# if __cpp_lib_quoted_string_io != 201304L +# error "__cpp_lib_quoted_string_io should have the value 201304L in c++17" +# endif +# else +# ifdef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should not be defined when the requirement '!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)' is not met!" 
+# endif # endif # ifdef __cpp_lib_ranges @@ -2671,11 +2683,17 @@ # error "__cpp_lib_saturation_arithmetic should not be defined before c++26" # endif -# ifndef __cpp_lib_scoped_lock -# error "__cpp_lib_scoped_lock should be defined in c++17" -# endif -# if __cpp_lib_scoped_lock != 201703L -# error "__cpp_lib_scoped_lock should have the value 201703L in c++17" +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should be defined in c++17" +# endif +# if __cpp_lib_scoped_lock != 201703L +# error "__cpp_lib_scoped_lock should have the value 201703L in c++17" +# endif +# else +# ifdef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should not be defined when the requirement '!defined(_LIBCPP_HAS_NO_THREADS)' is not met!" +# endif # endif # ifdef __cpp_lib_semaphore @@ -3366,7 +3384,7 @@ # error "__cpp_lib_expected should not be defined before c++23" # endif -# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY +# if !defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY) # ifndef __cpp_lib_filesystem # error "__cpp_lib_filesystem should be defined in c++20" # endif @@ -3375,7 +3393,7 @@ # endif # else # ifdef __cpp_lib_filesystem -# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY' is not met!" +# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY)' is not met!" 
# endif # endif @@ -3835,11 +3853,17 @@ # error "__cpp_lib_print should not be defined before c++23" # endif -# ifndef __cpp_lib_quoted_string_io -# error "__cpp_lib_quoted_string_io should be defined in c++20" -# endif -# if __cpp_lib_quoted_string_io != 201304L -# error "__cpp_lib_quoted_string_io should have the value 201304L in c++20" +# if !defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# ifndef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should be defined in c++20" +# endif +# if __cpp_lib_quoted_string_io != 201304L +# error "__cpp_lib_quoted_string_io should have the value 201304L in c++20" +# endif +# else +# ifdef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should not be defined when the requirement '!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)' is not met!" +# endif # endif # ifndef __cpp_lib_ranges @@ -3944,11 +3968,17 @@ # error "__cpp_lib_saturation_arithmetic should not be defined before c++26" # endif -# ifndef __cpp_lib_scoped_lock -# error "__cpp_lib_scoped_lock should be defined in c++20" -# endif -# if __cpp_lib_scoped_lock != 201703L -# error "__cpp_lib_scoped_lock should have the value 201703L in c++20" +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should be defined in c++20" +# endif +# if __cpp_lib_scoped_lock != 201703L +# error "__cpp_lib_scoped_lock should have the value 201703L in c++20" +# endif +# else +# ifdef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should not be defined when the requirement '!defined(_LIBCPP_HAS_NO_THREADS)' is not met!" 
+# endif # endif # if !defined(_LIBCPP_HAS_NO_THREADS) && (!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_SYNC) @@ -4750,7 +4780,7 @@ # error "__cpp_lib_expected should have the value 202211L in c++23" # endif -# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY +# if !defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY) # ifndef __cpp_lib_filesystem # error "__cpp_lib_filesystem should be defined in c++23" # endif @@ -4759,7 +4789,7 @@ # endif # else # ifdef __cpp_lib_filesystem -# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY' is not met!" +# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY)' is not met!" # endif # endif @@ -5267,11 +5297,17 @@ # error "__cpp_lib_print should have the value 202207L in c++23" # endif -# ifndef __cpp_lib_quoted_string_io -# error "__cpp_lib_quoted_string_io should be defined in c++23" -# endif -# if __cpp_lib_quoted_string_io != 201304L -# error "__cpp_lib_quoted_string_io should have the value 201304L in c++23" +# if !defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# ifndef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should be defined in c++23" +# endif +# if __cpp_lib_quoted_string_io != 201304L +# error "__cpp_lib_quoted_string_io should have the value 201304L in c++23" +# endif +# else +# ifdef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should not be defined when the requirement '!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)' is not met!" 
+# endif # endif # ifndef __cpp_lib_ranges @@ -5460,11 +5496,17 @@ # error "__cpp_lib_saturation_arithmetic should not be defined before c++26" # endif -# ifndef __cpp_lib_scoped_lock -# error "__cpp_lib_scoped_lock should be defined in c++23" -# endif -# if __cpp_lib_scoped_lock != 201703L -# error "__cpp_lib_scoped_lock should have the value 201703L in c++23" +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should be defined in c++23" +# endif +# if __cpp_lib_scoped_lock != 201703L +# error "__cpp_lib_scoped_lock should have the value 201703L in c++23" +# endif +# else +# ifdef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should not be defined when the requirement '!defined(_LIBCPP_HAS_NO_THREADS)' is not met!" +# endif # endif # if !defined(_LIBCPP_HAS_NO_THREADS) && (!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_SYNC) @@ -6347,7 +6389,7 @@ # error "__cpp_lib_expected should have the value 202211L in c++26" # endif -# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY +# if !defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY) # ifndef __cpp_lib_filesystem # error "__cpp_lib_filesystem should be defined in c++26" # endif @@ -6356,7 +6398,7 @@ # endif # else # ifdef __cpp_lib_filesystem -# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY' is not met!" +# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY)' is not met!" 
# endif # endif @@ -6511,11 +6553,17 @@ # endif # endif -# ifndef __cpp_lib_fstream_native_handle -# error "__cpp_lib_fstream_native_handle should be defined in c++26" -# endif -# if __cpp_lib_fstream_native_handle != 202306L -# error "__cpp_lib_fstream_native_handle should have the value 202306L in c++26" +# if !defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && !defined(_LIBCPP_HAS_NO_LOCALIZATION)) +# ifndef __cpp_lib_fstream_native_handle +# error "__cpp_lib_fstream_native_handle should be defined in c++26" +# endif +# if __cpp_lib_fstream_native_handle != 202306L +# error "__cpp_lib_fstream_native_handle should have the value 202306L in c++26" +# endif +# else +# ifdef __cpp_lib_fstream_native_handle +# error "__cpp_lib_fstream_native_handle should not be defined when the requirement '!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && !defined(_LIBCPP_HAS_NO_LOCALIZATION))' is not met!" +# endif # endif # if !defined(_LIBCPP_VERSION) @@ -6966,11 +7014,17 @@ # error "__cpp_lib_print should have the value 202207L in c++26" # endif -# ifndef __cpp_lib_quoted_string_io -# error "__cpp_lib_quoted_string_io should be defined in c++26" -# endif -# if __cpp_lib_quoted_string_io != 201304L -# error "__cpp_lib_quoted_string_io should have the value 201304L in c++26" +# if !defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# ifndef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should be defined in c++26" +# endif +# if __cpp_lib_quoted_string_io != 201304L +# error "__cpp_lib_quoted_string_io should have the value 201304L in c++26" +# endif +# else +# ifdef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should not be defined when the requirement '!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)' is not met!" 
+# endif # endif # ifndef __cpp_lib_ranges @@ -7174,11 +7228,17 @@ # error "__cpp_lib_saturation_arithmetic should have the value 202311L in c++26" # endif -# ifndef __cpp_lib_scoped_lock -# error "__cpp_lib_scoped_lock should be defined in c++26" -# endif -# if __cpp_lib_scoped_lock != 201703L -# error "__cpp_lib_scoped_lock should have the value 201703L in c++26" +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should be defined in c++26" +# endif +# if __cpp_lib_scoped_lock != 201703L +# error "__cpp_lib_scoped_lock should have the value 201703L in c++26" +# endif +# else +# ifdef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should not be defined when the requirement '!defined(_LIBCPP_HAS_NO_THREADS)' is not met!" +# endif # endif # if !defined(_LIBCPP_HAS_NO_THREADS) && (!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_SYNC) diff --git a/libcxx/test/std/strings/basic.string/string.capacity/max_size.pass.cpp b/libcxx/test/std/strings/basic.string/string.capacity/max_size.pass.cpp index 52dbde45dbb265..32ce1c8bf617dc 100644 --- a/libcxx/test/std/strings/basic.string/string.capacity/max_size.pass.cpp +++ b/libcxx/test/std/strings/basic.string/string.capacity/max_size.pass.cpp @@ -7,6 +7,12 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: no-exceptions + +// After changing the alignment of the allocated pointer from 16 to 8, the exception thrown is no longer `bad_alloc` +// but instead length_error on systems using new headers but older dylibs. 
+// +// XFAIL: stdlib=apple-libc++ && target={{.+}}-apple-macosx{{10.13|10.15|11.0}} + // // size_type max_size() const; // constexpr since C++20 diff --git a/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.pair_U_V_move.pass.cpp b/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.pair_U_V_move.pass.cpp index 30fdb19fd3aebe..3b2d093eb34d49 100644 --- a/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.pair_U_V_move.pass.cpp +++ b/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.pair_U_V_move.pass.cpp @@ -121,26 +121,7 @@ int main(int, char**) test_pair_rv(); test_pair_rv(); - /* For ExplicitTypes::CopyOnly, two of the viable candidates for initializing from a non-const xvalue are: - * pair(const pair&); // (defaulted copy constructor) - * template explicit pair(const pair&&); [U1 = ExplicitTypes::CopyOnly, U2 = int] - * This results in diverging behavior for test_convertible which uses copy-list-initialization - * Prior to CWG2137, this would have selected the first (non-explicit) ctor as explicit ctors would not be considered - * Afterwards, it should select the second since it is a better match, and then failed because it is explicit - * - * This may change with future defect reports, and some compilers only have partial support for CWG2137, - * so use std::is_convertible directly to avoid a copy-list-initialization - */ - { - using P1 = std::pair; - using P2 = std::pair; - using UP1 = std::pair&&; - using UP2 = std::pair&&; - static_assert(std::is_constructible::value, ""); - static_assert(std::is_convertible::value, ""); - static_assert(std::is_constructible::value, ""); - static_assert(std::is_convertible::value, ""); - } + test_pair_rv(); test_pair_rv(); test_pair_rv(); diff --git a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt index 978e7095216522..a52140e2b9938a 100644 --- a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt +++ 
b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt @@ -5,7 +5,7 @@ set(LLVM_DIR_SAVE ${LLVM_DIR}) set(Clang_DIR_SAVE ${Clang_DIR}) -find_package(Clang 18) +find_package(Clang 18.1) if (NOT Clang_FOUND) find_package(Clang 17) endif() diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index b7d95d451f2137..9825d4c8ec1df1 100755 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -456,8 +456,8 @@ def add_version_header(tc): "name": "__cpp_lib_filesystem", "values": {"c++17": 201703}, "headers": ["filesystem"], - "test_suite_guard": "!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY", - "libcxx_guard": "_LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY", + "test_suite_guard": "!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY)", + "libcxx_guard": "!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY", }, { "name": "__cpp_lib_format", @@ -570,6 +570,8 @@ def add_version_header(tc): "name": "__cpp_lib_fstream_native_handle", "values": {"c++26": 202306}, # P1759R6 Native handles and file streams "headers": ["fstream"], + "test_suite_guard": "!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && !defined(_LIBCPP_HAS_NO_LOCALIZATION))", + "libcxx_guard": "!defined(_LIBCPP_HAS_NO_FILESYSTEM) && !defined(_LIBCPP_HAS_NO_LOCALIZATION)", }, { "name": "__cpp_lib_function_ref", @@ -879,6 +881,8 @@ def add_version_header(tc): "name": "__cpp_lib_quoted_string_io", "values": {"c++14": 201304}, "headers": ["iomanip"], + "test_suite_guard": "!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "libcxx_guard": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", }, { "name": "__cpp_lib_ranges", @@ -1016,6 +1020,8 @@ def add_version_header(tc): "name": "__cpp_lib_scoped_lock", "values": {"c++17": 201703}, 
"headers": ["mutex"], + "test_suite_guard": "!defined(_LIBCPP_HAS_NO_THREADS)", + "libcxx_guard": "!defined(_LIBCPP_HAS_NO_THREADS)", }, { "name": "__cpp_lib_semaphore", diff --git a/libcxxabi/src/CMakeLists.txt b/libcxxabi/src/CMakeLists.txt index 4198827203fc8b..f4722c3b352d4d 100644 --- a/libcxxabi/src/CMakeLists.txt +++ b/libcxxabi/src/CMakeLists.txt @@ -265,7 +265,10 @@ if(LIBCXXABI_HERMETIC_STATIC_LIBRARY) # then its code shouldn't declare them with hidden visibility. They might # actually be provided by a shared library at link time. if (LIBCXXABI_ENABLE_NEW_DELETE_DEFINITIONS) - target_add_compile_flags_if_supported(cxxabi_static_objects PRIVATE -fvisibility-global-new-delete-hidden) + target_add_compile_flags_if_supported(cxxabi_static_objects PRIVATE -fvisibility-global-new-delete=force-hidden) + if (NOT CXX_SUPPORTS_FVISIBILITY_GLOBAL_NEW_DELETE_EQ_FORCE_HIDDEN_FLAG) + target_add_compile_flags_if_supported(cxxabi_static_objects PRIVATE -fvisibility-global-new-delete-hidden) + endif() endif() # _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS can be defined in libcxx's # __config_site too. Define it in the same way here, to avoid redefinition diff --git a/libunwind/CMakeLists.txt b/libunwind/CMakeLists.txt index bb1b052f61d875..806d5a783ec39c 100644 --- a/libunwind/CMakeLists.txt +++ b/libunwind/CMakeLists.txt @@ -21,6 +21,7 @@ set(LIBUNWIND_LIBCXX_PATH "${CMAKE_CURRENT_LIST_DIR}/../libcxx" CACHE PATH "Specify path to libc++ source.") include(GNUInstallDirs) +include(CheckSymbolExists) #=============================================================================== # Setup CMake Options @@ -96,6 +97,20 @@ endif() option(LIBUNWIND_HIDE_SYMBOLS "Do not export any symbols from the static library." ${LIBUNWIND_DEFAULT_HIDE_SYMBOLS}) +# If toolchain is FPXX, we switch to FP64 to save the full FPRs. 
See: +# https://web.archive.org/web/20180828210612/https://dmz-portal.mips.com/wiki/MIPS_O32_ABI_-_FR0_and_FR1_Interlinking +check_symbol_exists(__mips_hard_float "" __MIPSHF) +check_symbol_exists(_ABIO32 "" __MIPS_O32) +if (__MIPSHF AND __MIPS_O32) + file(WRITE ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/mips_is_fpxx.c + "#if __mips_fpr != 0\n" + "# error\n" + "#endif\n") + try_compile(MIPS_FPABI_FPXX ${CMAKE_BINARY_DIR} + ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/mips_is_fpxx.c + CMAKE_FLAGS -DCMAKE_C_LINK_EXECUTABLE='echo') +endif() + #=============================================================================== # Configure System #=============================================================================== @@ -179,6 +194,10 @@ if (WIN32) add_compile_flags_if_supported(-Wno-dll-attribute-on-redeclaration) endif() +if (MIPS_FPABI_FPXX) + add_compile_flags(-mfp64) +endif() + # Get feature flags. # Exceptions # Catches C++ exceptions only and tells the compiler to assume that extern C diff --git a/libunwind/src/CMakeLists.txt b/libunwind/src/CMakeLists.txt index 9c6f5d908b0945..780430ba70ba60 100644 --- a/libunwind/src/CMakeLists.txt +++ b/libunwind/src/CMakeLists.txt @@ -201,7 +201,10 @@ set_target_properties(unwind_static_objects if(LIBUNWIND_HIDE_SYMBOLS) target_add_compile_flags_if_supported(unwind_static_objects PRIVATE -fvisibility=hidden) - target_add_compile_flags_if_supported(unwind_static_objects PRIVATE -fvisibility-global-new-delete-hidden) + target_add_compile_flags_if_supported(unwind_static_objects PRIVATE -fvisibility-global-new-delete=force-hidden) + if (NOT CXX_SUPPORTS_FVISIBILITY_GLOBAL_NEW_DELETE_EQ_FORCE_HIDDEN_FLAG) + target_add_compile_flags_if_supported(unwind_static_objects PRIVATE -fvisibility-global-new-delete-hidden) + endif() target_compile_definitions(unwind_static_objects PRIVATE _LIBUNWIND_HIDE_SYMBOLS) endif() diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp index 39f4575031be54..e2074932bc466e 100644 
--- a/lld/COFF/Chunks.cpp +++ b/lld/COFF/Chunks.cpp @@ -652,6 +652,13 @@ void SectionChunk::getRuntimePseudoRelocs( dyn_cast_or_null(file->getSymbol(rel.SymbolTableIndex)); if (!target || !target->isRuntimePseudoReloc) continue; + // If the target doesn't have a chunk allocated, it may be a + // DefinedImportData symbol which ended up unnecessary after GC. + // Normally we wouldn't eliminate section chunks that are referenced, but + // references within DWARF sections don't count for keeping section chunks + // alive. Thus such dangling references in DWARF sections are expected. + if (!target->getChunk()) + continue; int sizeInBits = getRuntimePseudoRelocSize(rel.Type, file->ctx.config.machine); if (sizeInBits == 0) { diff --git a/lld/COFF/DLL.cpp b/lld/COFF/DLL.cpp index 6b516d8c6d5ef8..c4388ba9e40d0b 100644 --- a/lld/COFF/DLL.cpp +++ b/lld/COFF/DLL.cpp @@ -172,7 +172,7 @@ binImports(COFFLinkerContext &ctx, // A chunk for the delay import descriptor table etnry. class DelayDirectoryChunk : public NonSectionChunk { public: - explicit DelayDirectoryChunk(Chunk *n) : dllName(n) {} + explicit DelayDirectoryChunk(Chunk *n) : dllName(n) { setAlignment(4); } size_t getSize() const override { return sizeof(delay_import_directory_table_entry); diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index e0afb6b18805b2..22ee2f133be98a 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -1825,7 +1825,15 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { } } else { config->repro = false; - config->timestamp = time(nullptr); + if (std::optional epoch = + Process::GetEnv("SOURCE_DATE_EPOCH")) { + StringRef value(*epoch); + if (value.getAsInteger(0, config->timestamp)) + fatal(Twine("invalid SOURCE_DATE_EPOCH timestamp: ") + value + + ". 
Expected 32-bit integer"); + } else { + config->timestamp = time(nullptr); + } } // Handle /alternatename diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp index ab2ec5b447d000..464f5dfb320ccc 100644 --- a/lld/ELF/Arch/LoongArch.cpp +++ b/lld/ELF/Arch/LoongArch.cpp @@ -11,6 +11,7 @@ #include "Symbols.h" #include "SyntheticSections.h" #include "Target.h" +#include "llvm/Support/LEB128.h" using namespace llvm; using namespace llvm::object; @@ -36,9 +37,12 @@ class LoongArch final : public TargetInfo { bool usesOnlyLowPageBits(RelType type) const override; void relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const override; + bool relaxOnce(int pass) const override; + void finalizeRelax(int passes) const override; }; } // end anonymous namespace +namespace { enum Op { SUB_W = 0x00110000, SUB_D = 0x00118000, @@ -63,6 +67,7 @@ enum Reg { R_T2 = 14, R_T3 = 15, }; +} // namespace // Mask out the input's lowest 12 bits for use with `pcalau12i`, in sequences // like `pcalau12i + addi.[wd]` or `pcalau12i + {ld,st}.*` where the `pcalau12i` @@ -151,6 +156,17 @@ static bool isJirl(uint32_t insn) { return (insn & 0xfc000000) == JIRL; } +static void handleUleb128(uint8_t *loc, uint64_t val) { + const uint32_t maxcount = 1 + 64 / 7; + uint32_t count; + const char *error = nullptr; + uint64_t orig = decodeULEB128(loc, &count, nullptr, &error); + if (count > maxcount || (count == maxcount && error)) + errorOrWarn(getErrorLocation(loc) + "extra space for uleb128"); + uint64_t mask = count < maxcount ? (1ULL << 7 * count) - 1 : -1ULL; + encodeULEB128((orig + val) & mask, loc, count); +} + LoongArch::LoongArch() { // The LoongArch ISA itself does not have a limit on page sizes. 
According to // the ISA manual, the PS (page size) field in MTLB entries and CSR.STLBPS is @@ -392,11 +408,13 @@ RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s, case R_LARCH_ADD16: case R_LARCH_ADD32: case R_LARCH_ADD64: + case R_LARCH_ADD_ULEB128: case R_LARCH_SUB6: case R_LARCH_SUB8: case R_LARCH_SUB16: case R_LARCH_SUB32: case R_LARCH_SUB64: + case R_LARCH_SUB_ULEB128: // The LoongArch add/sub relocs behave like the RISCV counterparts; reuse // the RelExpr to avoid code duplication. return R_RISCV_ADD; @@ -465,8 +483,9 @@ RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s, case R_LARCH_TLS_GD_HI20: return R_TLSGD_GOT; case R_LARCH_RELAX: - // LoongArch linker relaxation is not implemented yet. - return R_NONE; + return config->relax ? R_RELAX_HINT : R_NONE; + case R_LARCH_ALIGN: + return R_RELAX_HINT; // Other known relocs that are explicitly unimplemented: // @@ -630,6 +649,9 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel, case R_LARCH_ADD64: write64le(loc, read64le(loc) + val); return; + case R_LARCH_ADD_ULEB128: + handleUleb128(loc, val); + return; case R_LARCH_SUB6: *loc = (*loc & 0xc0) | ((*loc - val) & 0x3f); return; @@ -645,6 +667,9 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel, case R_LARCH_SUB64: write64le(loc, read64le(loc) - val); return; + case R_LARCH_SUB_ULEB128: + handleUleb128(loc, -val); + return; case R_LARCH_MARK_LA: case R_LARCH_MARK_PCREL: @@ -659,6 +684,155 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel, } } +static bool relax(InputSection &sec) { + const uint64_t secAddr = sec.getVA(); + const MutableArrayRef relocs = sec.relocs(); + auto &aux = *sec.relaxAux; + bool changed = false; + ArrayRef sa = ArrayRef(aux.anchors); + uint64_t delta = 0; + + std::fill_n(aux.relocTypes.get(), relocs.size(), R_LARCH_NONE); + aux.writes.clear(); + for (auto [i, r] : llvm::enumerate(relocs)) { + const uint64_t loc = secAddr + r.offset - delta; + uint32_t &cur = 
aux.relocDeltas[i], remove = 0; + switch (r.type) { + case R_LARCH_ALIGN: { + const uint64_t addend = + r.sym->isUndefined() ? Log2_64(r.addend) + 1 : r.addend; + const uint64_t allBytes = (1 << (addend & 0xff)) - 4; + const uint64_t align = 1 << (addend & 0xff); + const uint64_t maxBytes = addend >> 8; + const uint64_t off = loc & (align - 1); + const uint64_t curBytes = off == 0 ? 0 : align - off; + // All bytes beyond the alignment boundary should be removed. + // If emit bytes more than max bytes to emit, remove all. + if (maxBytes != 0 && curBytes > maxBytes) + remove = allBytes; + else + remove = allBytes - curBytes; + // If we can't satisfy this alignment, we've found a bad input. + if (LLVM_UNLIKELY(static_cast(remove) < 0)) { + errorOrWarn(getErrorLocation((const uint8_t *)loc) + + "insufficient padding bytes for " + lld::toString(r.type) + + ": " + Twine(allBytes) + " bytes available for " + + "requested alignment of " + Twine(align) + " bytes"); + remove = 0; + } + break; + } + } + + // For all anchors whose offsets are <= r.offset, they are preceded by + // the previous relocation whose `relocDeltas` value equals `delta`. + // Decrease their st_value and update their st_size. + for (; sa.size() && sa[0].offset <= r.offset; sa = sa.slice(1)) { + if (sa[0].end) + sa[0].d->size = sa[0].offset - delta - sa[0].d->value; + else + sa[0].d->value = sa[0].offset - delta; + } + delta += remove; + if (delta != cur) { + cur = delta; + changed = true; + } + } + + for (const SymbolAnchor &a : sa) { + if (a.end) + a.d->size = a.offset - delta - a.d->value; + else + a.d->value = a.offset - delta; + } + // Inform assignAddresses that the size has changed. + if (!isUInt<32>(delta)) + fatal("section size decrease is too large: " + Twine(delta)); + sec.bytesDropped = delta; + return changed; +} + +// When relaxing just R_LARCH_ALIGN, relocDeltas is usually changed only once in +// the absence of a linker script. 
For call and load/store R_LARCH_RELAX, code +// shrinkage may reduce displacement and make more relocations eligible for +// relaxation. Code shrinkage may increase displacement to a call/load/store +// target at a higher fixed address, invalidating an earlier relaxation. Any +// change in section sizes can have cascading effect and require another +// relaxation pass. +bool LoongArch::relaxOnce(int pass) const { + if (config->relocatable) + return false; + + if (pass == 0) + initSymbolAnchors(); + + SmallVector storage; + bool changed = false; + for (OutputSection *osec : outputSections) { + if (!(osec->flags & SHF_EXECINSTR)) + continue; + for (InputSection *sec : getInputSections(*osec, storage)) + changed |= relax(*sec); + } + return changed; +} + +void LoongArch::finalizeRelax(int passes) const { + log("relaxation passes: " + Twine(passes)); + SmallVector storage; + for (OutputSection *osec : outputSections) { + if (!(osec->flags & SHF_EXECINSTR)) + continue; + for (InputSection *sec : getInputSections(*osec, storage)) { + RelaxAux &aux = *sec->relaxAux; + if (!aux.relocDeltas) + continue; + + MutableArrayRef rels = sec->relocs(); + ArrayRef old = sec->content(); + size_t newSize = old.size() - aux.relocDeltas[rels.size() - 1]; + uint8_t *p = context().bAlloc.Allocate(newSize); + uint64_t offset = 0; + int64_t delta = 0; + sec->content_ = p; + sec->size = newSize; + sec->bytesDropped = 0; + + // Update section content: remove NOPs for R_LARCH_ALIGN and rewrite + // instructions for relaxed relocations. + for (size_t i = 0, e = rels.size(); i != e; ++i) { + uint32_t remove = aux.relocDeltas[i] - delta; + delta = aux.relocDeltas[i]; + if (remove == 0 && aux.relocTypes[i] == R_LARCH_NONE) + continue; + + // Copy from last location to the current relocated location. 
+ const Relocation &r = rels[i]; + uint64_t size = r.offset - offset; + memcpy(p, old.data() + offset, size); + p += size; + offset = r.offset + remove; + } + memcpy(p, old.data() + offset, old.size() - offset); + + // Subtract the previous relocDeltas value from the relocation offset. + // For a pair of R_LARCH_XXX/R_LARCH_RELAX with the same offset, decrease + // their r_offset by the same delta. + delta = 0; + for (size_t i = 0, e = rels.size(); i != e;) { + uint64_t cur = rels[i].offset; + do { + rels[i].offset -= delta; + if (aux.relocTypes[i] != R_LARCH_NONE) + rels[i].type = aux.relocTypes[i]; + } while (++i != e && rels[i].offset == cur); + delta = aux.relocDeltas[i - 1]; + } + } + } +} + TargetInfo *elf::getLoongArchTargetInfo() { static LoongArch target; return ⌖ diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp index de52f6a79a40b9..019c073bd541b6 100644 --- a/lld/ELF/Arch/PPC64.cpp +++ b/lld/ELF/Arch/PPC64.cpp @@ -26,6 +26,7 @@ using namespace lld::elf; constexpr uint64_t ppc64TocOffset = 0x8000; constexpr uint64_t dynamicThreadPointerOffset = 0x8000; +namespace { // The instruction encoding of bits 21-30 from the ISA for the Xform and Dform // instructions that can be used as part of the initial exec TLS sequence. enum XFormOpcd { @@ -139,6 +140,7 @@ enum class PPCPrefixedInsn : uint64_t { PSTXV = PREFIX_8LS | 0xd8000000, PSTXVP = PREFIX_8LS | 0xf8000000 }; + static bool checkPPCLegacyInsn(uint32_t encoding) { PPCLegacyInsn insn = static_cast(encoding); if (insn == PPCLegacyInsn::NOINSN) @@ -164,7 +166,6 @@ enum class LegacyToPrefixMask : uint64_t { 0x8000000003e00000, // S/T (6-10) - The [S/T]X bit moves from 28 to 5. 
}; -namespace { class PPC64 final : public TargetInfo { public: PPC64(); diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp index d7d3d3e4781497..4798c86f7d1b61 100644 --- a/lld/ELF/Arch/RISCV.cpp +++ b/lld/ELF/Arch/RISCV.cpp @@ -45,6 +45,7 @@ class RISCV final : public TargetInfo { uint64_t val) const override; void relocateAlloc(InputSectionBase &sec, uint8_t *buf) const override; bool relaxOnce(int pass) const override; + void finalizeRelax(int passes) const override; }; } // end anonymous namespace @@ -56,11 +57,13 @@ class RISCV final : public TargetInfo { const uint64_t dtpOffset = 0x800; +namespace { enum Op { ADDI = 0x13, AUIPC = 0x17, JALR = 0x67, LD = 0x3003, + LUI = 0x37, LW = 0x2003, SRLI = 0x5013, SUB = 0x40000033, @@ -73,8 +76,10 @@ enum Reg { X_T0 = 5, X_T1 = 6, X_T2 = 7, + X_A0 = 10, X_T3 = 28, }; +} // namespace static uint32_t hi20(uint32_t val) { return (val + 0x800) >> 12; } static uint32_t lo12(uint32_t val) { return val & 4095; } @@ -119,6 +124,7 @@ RISCV::RISCV() { tlsGotRel = R_RISCV_TLS_TPREL32; } gotRel = symbolicRel; + tlsDescRel = R_RISCV_TLSDESC; // .got[0] = _DYNAMIC gotHeaderEntriesNum = 1; @@ -187,6 +193,8 @@ int64_t RISCV::getImplicitAddend(const uint8_t *buf, RelType type) const { case R_RISCV_JUMP_SLOT: // These relocations are defined as not having an implicit addend. return 0; + case R_RISCV_TLSDESC: + return config->is64 ? 
read64le(buf + 8) : read32le(buf + 4); } } @@ -295,6 +303,12 @@ RelExpr RISCV::getRelExpr(const RelType type, const Symbol &s, case R_RISCV_PCREL_LO12_I: case R_RISCV_PCREL_LO12_S: return R_RISCV_PC_INDIRECT; + case R_RISCV_TLSDESC_HI20: + case R_RISCV_TLSDESC_LOAD_LO12: + case R_RISCV_TLSDESC_ADD_LO12: + return R_TLSDESC_PC; + case R_RISCV_TLSDESC_CALL: + return R_TLSDESC_CALL; case R_RISCV_TLS_GD_HI20: return R_TLSGD_PC; case R_RISCV_TLS_GOT_HI20: @@ -419,6 +433,7 @@ void RISCV::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { case R_RISCV_GOT_HI20: case R_RISCV_PCREL_HI20: + case R_RISCV_TLSDESC_HI20: case R_RISCV_TLS_GD_HI20: case R_RISCV_TLS_GOT_HI20: case R_RISCV_TPREL_HI20: @@ -430,6 +445,8 @@ void RISCV::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { } case R_RISCV_PCREL_LO12_I: + case R_RISCV_TLSDESC_LOAD_LO12: + case R_RISCV_TLSDESC_ADD_LO12: case R_RISCV_TPREL_LO12_I: case R_RISCV_LO12_I: { uint64_t hi = (val + 0x800) >> 12; @@ -513,32 +530,133 @@ void RISCV::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { break; case R_RISCV_RELAX: - return; // Ignored (for now) - + return; + case R_RISCV_TLSDESC: + // The addend is stored in the second word. 
+ if (config->is64) + write64le(loc + 8, val); + else + write32le(loc + 4, val); + break; default: llvm_unreachable("unknown relocation"); } } +static bool relaxable(ArrayRef relocs, size_t i) { + return i + 1 != relocs.size() && relocs[i + 1].type == R_RISCV_RELAX; +} + +static void tlsdescToIe(uint8_t *loc, const Relocation &rel, uint64_t val) { + switch (rel.type) { + case R_RISCV_TLSDESC_HI20: + case R_RISCV_TLSDESC_LOAD_LO12: + write32le(loc, 0x00000013); // nop + break; + case R_RISCV_TLSDESC_ADD_LO12: + write32le(loc, utype(AUIPC, X_A0, hi20(val))); // auipc a0, + break; + case R_RISCV_TLSDESC_CALL: + if (config->is64) + write32le(loc, itype(LD, X_A0, X_A0, lo12(val))); // ld a0,(a0) + else + write32le(loc, itype(LW, X_A0, X_A0, lo12(val))); // lw a0,(a0) + break; + default: + llvm_unreachable("unsupported relocation for TLSDESC to IE"); + } +} + +static void tlsdescToLe(uint8_t *loc, const Relocation &rel, uint64_t val) { + switch (rel.type) { + case R_RISCV_TLSDESC_HI20: + case R_RISCV_TLSDESC_LOAD_LO12: + write32le(loc, 0x00000013); // nop + return; + case R_RISCV_TLSDESC_ADD_LO12: + if (isInt<12>(val)) + write32le(loc, 0x00000013); // nop + else + write32le(loc, utype(LUI, X_A0, hi20(val))); // lui a0, + return; + case R_RISCV_TLSDESC_CALL: + if (isInt<12>(val)) + write32le(loc, itype(ADDI, X_A0, 0, val)); // addi a0,zero, + else + write32le(loc, itype(ADDI, X_A0, X_A0, lo12(val))); // addi a0,a0, + return; + default: + llvm_unreachable("unsupported relocation for TLSDESC to LE"); + } +} + void RISCV::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { uint64_t secAddr = sec.getOutputSection()->addr; if (auto *s = dyn_cast(&sec)) secAddr += s->outSecOff; else if (auto *ehIn = dyn_cast(&sec)) secAddr += ehIn->getParent()->outSecOff; - for (size_t i = 0, size = sec.relocs().size(); i != size; ++i) { - const Relocation &rel = sec.relocs()[i]; + uint64_t tlsdescVal = 0; + bool tlsdescRelax = false, isToLe = false; + const ArrayRef relocs = 
sec.relocs(); + for (size_t i = 0, size = relocs.size(); i != size; ++i) { + const Relocation &rel = relocs[i]; uint8_t *loc = buf + rel.offset; - const uint64_t val = + uint64_t val = sec.getRelocTargetVA(sec.file, rel.type, rel.addend, secAddr + rel.offset, *rel.sym, rel.expr); switch (rel.expr) { case R_RELAX_HINT: + continue; + case R_TLSDESC_PC: + // For R_RISCV_TLSDESC_HI20, store &got(sym)-PC to be used by the + // following two instructions L[DW] and ADDI. + if (rel.type == R_RISCV_TLSDESC_HI20) + tlsdescVal = val; + else + val = tlsdescVal; break; + case R_RELAX_TLS_GD_TO_IE: + // Only R_RISCV_TLSDESC_HI20 reaches here. tlsdescVal will be finalized + // after we see R_RISCV_TLSDESC_ADD_LO12 in the R_RELAX_TLS_GD_TO_LE case. + // The net effect is that tlsdescVal will be smaller than `val` to take + // into account of NOP instructions (in the absence of R_RISCV_RELAX) + // before AUIPC. + tlsdescVal = val + rel.offset; + isToLe = false; + tlsdescRelax = relaxable(relocs, i); + if (!tlsdescRelax) + tlsdescToIe(loc, rel, val); + continue; + case R_RELAX_TLS_GD_TO_LE: + // See the comment in handleTlsRelocation. For TLSDESC=>IE, + // R_RISCV_TLSDESC_{LOAD_LO12,ADD_LO12,CALL} also reach here. If isToIe is + // true, this is actually TLSDESC=>IE optimization. + if (rel.type == R_RISCV_TLSDESC_HI20) { + tlsdescVal = val; + isToLe = true; + tlsdescRelax = relaxable(relocs, i); + } else { + if (!isToLe && rel.type == R_RISCV_TLSDESC_ADD_LO12) + tlsdescVal -= rel.offset; + val = tlsdescVal; + } + // When NOP conversion is eligible and relaxation applies, don't write a + // NOP in case an unrelated instruction follows the current instruction. 
+ if (tlsdescRelax && + (rel.type == R_RISCV_TLSDESC_HI20 || + rel.type == R_RISCV_TLSDESC_LOAD_LO12 || + (rel.type == R_RISCV_TLSDESC_ADD_LO12 && isToLe && !hi20(val)))) + continue; + if (isToLe) + tlsdescToLe(loc, rel, val); + else + tlsdescToIe(loc, rel, val); + continue; case R_RISCV_LEB128: if (i + 1 < size) { - const Relocation &rel1 = sec.relocs()[i + 1]; + const Relocation &rel1 = relocs[i + 1]; if (rel.type == R_RISCV_SET_ULEB128 && rel1.type == R_RISCV_SUB_ULEB128 && rel.offset == rel1.offset) { auto val = rel.sym->getVA(rel.addend) - rel1.sym->getVA(rel1.addend); @@ -554,39 +672,19 @@ void RISCV::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { ": R_RISCV_SET_ULEB128 not paired with R_RISCV_SUB_SET128"); return; default: - relocate(loc, rel, val); break; } + relocate(loc, rel, val); } } -namespace { -struct SymbolAnchor { - uint64_t offset; - Defined *d; - bool end; // true for the anchor of st_value+st_size -}; -} // namespace - -struct elf::RISCVRelaxAux { - // This records symbol start and end offsets which will be adjusted according - // to the nearest relocDeltas element. - SmallVector anchors; - // For relocations[i], the actual offset is r_offset - (i ? relocDeltas[i-1] : - // 0). - std::unique_ptr relocDeltas; - // For relocations[i], the actual type is relocTypes[i]. 
- std::unique_ptr relocTypes; - SmallVector writes; -}; - -static void initSymbolAnchors() { +void elf::initSymbolAnchors() { SmallVector storage; for (OutputSection *osec : outputSections) { if (!(osec->flags & SHF_EXECINSTR)) continue; for (InputSection *sec : getInputSections(*osec, storage)) { - sec->relaxAux = make(); + sec->relaxAux = make(); if (sec->relocs().size()) { sec->relaxAux->relocDeltas = std::make_unique(sec->relocs().size()); @@ -715,14 +813,16 @@ static void relaxHi20Lo12(const InputSection &sec, size_t i, uint64_t loc, static bool relax(InputSection &sec) { const uint64_t secAddr = sec.getVA(); + const MutableArrayRef relocs = sec.relocs(); auto &aux = *sec.relaxAux; bool changed = false; ArrayRef sa = ArrayRef(aux.anchors); uint64_t delta = 0; + bool tlsdescRelax = false, toLeShortForm = false; - std::fill_n(aux.relocTypes.get(), sec.relocs().size(), R_RISCV_NONE); + std::fill_n(aux.relocTypes.get(), relocs.size(), R_RISCV_NONE); aux.writes.clear(); - for (auto [i, r] : llvm::enumerate(sec.relocs())) { + for (auto [i, r] : llvm::enumerate(relocs)) { const uint64_t loc = secAddr + r.offset - delta; uint32_t &cur = aux.relocDeltas[i], remove = 0; switch (r.type) { @@ -743,25 +843,37 @@ static bool relax(InputSection &sec) { } case R_RISCV_CALL: case R_RISCV_CALL_PLT: - if (i + 1 != sec.relocs().size() && - sec.relocs()[i + 1].type == R_RISCV_RELAX) + if (relaxable(relocs, i)) relaxCall(sec, i, loc, r, remove); break; case R_RISCV_TPREL_HI20: case R_RISCV_TPREL_ADD: case R_RISCV_TPREL_LO12_I: case R_RISCV_TPREL_LO12_S: - if (i + 1 != sec.relocs().size() && - sec.relocs()[i + 1].type == R_RISCV_RELAX) + if (relaxable(relocs, i)) relaxTlsLe(sec, i, loc, r, remove); break; case R_RISCV_HI20: case R_RISCV_LO12_I: case R_RISCV_LO12_S: - if (i + 1 != sec.relocs().size() && - sec.relocs()[i + 1].type == R_RISCV_RELAX) + if (relaxable(relocs, i)) relaxHi20Lo12(sec, i, loc, r, remove); break; + case R_RISCV_TLSDESC_HI20: + // For TLSDESC=>LE, we can use 
the short form if hi20 is zero. + tlsdescRelax = relaxable(relocs, i); + toLeShortForm = tlsdescRelax && r.expr == R_RELAX_TLS_GD_TO_LE && + !hi20(r.sym->getVA(r.addend)); + [[fallthrough]]; + case R_RISCV_TLSDESC_LOAD_LO12: + // For TLSDESC=>LE/IE, AUIPC and L[DW] are removed if relaxable. + if (tlsdescRelax && r.expr != R_TLSDESC_PC) + remove = 4; + break; + case R_RISCV_TLSDESC_ADD_LO12: + if (toLeShortForm) + remove = 4; + break; } // For all anchors whose offsets are <= r.offset, they are preceded by @@ -819,7 +931,7 @@ bool RISCV::relaxOnce(int pass) const { return changed; } -void elf::riscvFinalizeRelax(int passes) { +void RISCV::finalizeRelax(int passes) const { llvm::TimeTraceScope timeScope("Finalize RISC-V relaxation"); log("relaxation passes: " + Twine(passes)); SmallVector storage; @@ -827,7 +939,7 @@ void elf::riscvFinalizeRelax(int passes) { if (!(osec->flags & SHF_EXECINSTR)) continue; for (InputSection *sec : getInputSections(*osec, storage)) { - RISCVRelaxAux &aux = *sec->relaxAux; + RelaxAux &aux = *sec->relaxAux; if (!aux.relocDeltas) continue; diff --git a/lld/ELF/Arch/SystemZ.cpp b/lld/ELF/Arch/SystemZ.cpp new file mode 100644 index 00000000000000..d37db6877559dc --- /dev/null +++ b/lld/ELF/Arch/SystemZ.cpp @@ -0,0 +1,607 @@ +//===- SystemZ.cpp --------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "OutputSections.h" +#include "Symbols.h" +#include "SyntheticSections.h" +#include "Target.h" +#include "lld/Common/ErrorHandler.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/Support/Endian.h" + +using namespace llvm; +using namespace llvm::support::endian; +using namespace llvm::ELF; +using namespace lld; +using namespace lld::elf; + +namespace { +class SystemZ : public TargetInfo { +public: + SystemZ(); + int getTlsGdRelaxSkip(RelType type) const override; + RelExpr getRelExpr(RelType type, const Symbol &s, + const uint8_t *loc) const override; + RelType getDynRel(RelType type) const override; + void writeGotHeader(uint8_t *buf) const override; + void writeGotPlt(uint8_t *buf, const Symbol &s) const override; + void writeIgotPlt(uint8_t *buf, const Symbol &s) const override; + void writePltHeader(uint8_t *buf) const override; + void addPltHeaderSymbols(InputSection &isd) const override; + void writePlt(uint8_t *buf, const Symbol &sym, + uint64_t pltEntryAddr) const override; + RelExpr adjustTlsExpr(RelType type, RelExpr expr) const override; + RelExpr adjustGotPcExpr(RelType type, int64_t addend, + const uint8_t *loc) const override; + bool relaxOnce(int pass) const override; + void relocate(uint8_t *loc, const Relocation &rel, + uint64_t val) const override; + int64_t getImplicitAddend(const uint8_t *buf, RelType type) const override; + +private: + void relaxGot(uint8_t *loc, const Relocation &rel, uint64_t val) const; + void relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const; + void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const; + void relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const; +}; +} // namespace + +SystemZ::SystemZ() { + copyRel = R_390_COPY; + gotRel = R_390_GLOB_DAT; + pltRel = R_390_JMP_SLOT; + relativeRel = 
R_390_RELATIVE; + iRelativeRel = R_390_IRELATIVE; + symbolicRel = R_390_64; + tlsGotRel = R_390_TLS_TPOFF; + tlsModuleIndexRel = R_390_TLS_DTPMOD; + tlsOffsetRel = R_390_TLS_DTPOFF; + gotHeaderEntriesNum = 3; + gotPltHeaderEntriesNum = 0; + gotEntrySize = 8; + pltHeaderSize = 32; + pltEntrySize = 32; + ipltEntrySize = 32; + + // This "trap instruction" is used to fill gaps between sections. + // On SystemZ, the behavior of the GNU ld is to fill those gaps + // with nop instructions instead - and unfortunately the default + // glibc crt object files (used to) rely on that behavior since + // they use an alignment on the .init section fragments that causes + // gaps which must be filled with nops as they are being executed. + // Therefore, we provide a nop instruction as "trapInstr" here. + trapInstr = {0x07, 0x07, 0x07, 0x07}; + + defaultImageBase = 0x1000000; +} + +RelExpr SystemZ::getRelExpr(RelType type, const Symbol &s, + const uint8_t *loc) const { + switch (type) { + case R_390_NONE: + return R_NONE; + // Relocations targeting the symbol value. + case R_390_8: + case R_390_12: + case R_390_16: + case R_390_20: + case R_390_32: + case R_390_64: + return R_ABS; + case R_390_PC16: + case R_390_PC32: + case R_390_PC64: + case R_390_PC12DBL: + case R_390_PC16DBL: + case R_390_PC24DBL: + case R_390_PC32DBL: + return R_PC; + case R_390_GOTOFF16: + case R_390_GOTOFF: // a.k.a. R_390_GOTOFF32 + case R_390_GOTOFF64: + return R_GOTREL; + // Relocations targeting the PLT associated with the symbol. + case R_390_PLT32: + case R_390_PLT64: + case R_390_PLT12DBL: + case R_390_PLT16DBL: + case R_390_PLT24DBL: + case R_390_PLT32DBL: + return R_PLT_PC; + case R_390_PLTOFF16: + case R_390_PLTOFF32: + case R_390_PLTOFF64: + return R_PLT_GOTREL; + // Relocations targeting the GOT entry associated with the symbol. 
+ case R_390_GOTENT: + return R_GOT_PC; + case R_390_GOT12: + case R_390_GOT16: + case R_390_GOT20: + case R_390_GOT32: + case R_390_GOT64: + return R_GOT_OFF; + // Relocations targeting the GOTPLT entry associated with the symbol. + case R_390_GOTPLTENT: + return R_GOTPLT_PC; + case R_390_GOTPLT12: + case R_390_GOTPLT16: + case R_390_GOTPLT20: + case R_390_GOTPLT32: + case R_390_GOTPLT64: + return R_GOTPLT_GOTREL; + // Relocations targeting _GLOBAL_OFFSET_TABLE_. + case R_390_GOTPC: + case R_390_GOTPCDBL: + return R_GOTONLY_PC; + // TLS-related relocations. + case R_390_TLS_LOAD: + return R_NONE; + case R_390_TLS_GDCALL: + return R_TLSGD_PC; + case R_390_TLS_LDCALL: + return R_TLSLD_PC; + case R_390_TLS_GD32: + case R_390_TLS_GD64: + return R_TLSGD_GOT; + case R_390_TLS_LDM32: + case R_390_TLS_LDM64: + return R_TLSLD_GOT; + case R_390_TLS_LDO32: + case R_390_TLS_LDO64: + return R_DTPREL; + case R_390_TLS_LE32: + case R_390_TLS_LE64: + return R_TPREL; + case R_390_TLS_IE32: + case R_390_TLS_IE64: + return R_GOT; + case R_390_TLS_GOTIE12: + case R_390_TLS_GOTIE20: + case R_390_TLS_GOTIE32: + case R_390_TLS_GOTIE64: + return R_GOT_OFF; + case R_390_TLS_IEENT: + return R_GOT_PC; + + default: + error(getErrorLocation(loc) + "unknown relocation (" + Twine(type) + + ") against symbol " + toString(s)); + return R_NONE; + } +} + +void SystemZ::writeGotHeader(uint8_t *buf) const { + // _GLOBAL_OFFSET_TABLE_[0] holds the value of _DYNAMIC. + // _GLOBAL_OFFSET_TABLE_[1] and [2] are reserved. 
+ write64be(buf, mainPart->dynamic->getVA()); +} + +void SystemZ::writeGotPlt(uint8_t *buf, const Symbol &s) const { + write64be(buf, s.getPltVA() + 14); +} + +void SystemZ::writeIgotPlt(uint8_t *buf, const Symbol &s) const { + if (config->writeAddends) + write64be(buf, s.getVA()); +} + +void SystemZ::writePltHeader(uint8_t *buf) const { + const uint8_t pltData[] = { + 0xe3, 0x10, 0xf0, 0x38, 0x00, 0x24, // stg %r1,56(%r15) + 0xc0, 0x10, 0x00, 0x00, 0x00, 0x00, // larl %r1,_GLOBAL_OFFSET_TABLE_ + 0xd2, 0x07, 0xf0, 0x30, 0x10, 0x08, // mvc 48(8,%r15),8(%r1) + 0xe3, 0x10, 0x10, 0x10, 0x00, 0x04, // lg %r1,16(%r1) + 0x07, 0xf1, // br %r1 + 0x07, 0x00, // nopr + 0x07, 0x00, // nopr + 0x07, 0x00, // nopr + }; + memcpy(buf, pltData, sizeof(pltData)); + uint64_t got = in.got->getVA(); + uint64_t plt = in.plt->getVA(); + write32be(buf + 8, (got - plt - 6) >> 1); +} + +void SystemZ::addPltHeaderSymbols(InputSection &isec) const { + // The PLT header needs a reference to _GLOBAL_OFFSET_TABLE_, so we + // must ensure the .got section is created even if otherwise unused. 
+ in.got->hasGotOffRel.store(true, std::memory_order_relaxed); +} + +void SystemZ::writePlt(uint8_t *buf, const Symbol &sym, + uint64_t pltEntryAddr) const { + const uint8_t inst[] = { + 0xc0, 0x10, 0x00, 0x00, 0x00, 0x00, // larl %r1,<.got.plt slot> + 0xe3, 0x10, 0x10, 0x00, 0x00, 0x04, // lg %r1,0(%r1) + 0x07, 0xf1, // br %r1 + 0x0d, 0x10, // basr %r1,%r0 + 0xe3, 0x10, 0x10, 0x0c, 0x00, 0x14, // lgf %r1,12(%r1) + 0xc0, 0xf4, 0x00, 0x00, 0x00, 0x00, // jg + 0x00, 0x00, 0x00, 0x00, // + }; + memcpy(buf, inst, sizeof(inst)); + + write32be(buf + 2, (sym.getGotPltVA() - pltEntryAddr) >> 1); + write32be(buf + 24, (in.plt->getVA() - pltEntryAddr - 22) >> 1); + write32be(buf + 28, in.relaPlt->entsize * sym.getPltIdx()); +} + +int64_t SystemZ::getImplicitAddend(const uint8_t *buf, RelType type) const { + switch (type) { + case R_390_8: + return SignExtend64<8>(*buf); + case R_390_16: + case R_390_PC16: + return SignExtend64<16>(read16be(buf)); + case R_390_PC16DBL: + return SignExtend64<16>(read16be(buf)) << 1; + case R_390_32: + case R_390_PC32: + return SignExtend64<32>(read32be(buf)); + case R_390_PC32DBL: + return SignExtend64<32>(read32be(buf)) << 1; + case R_390_64: + case R_390_PC64: + case R_390_TLS_DTPMOD: + case R_390_TLS_DTPOFF: + case R_390_TLS_TPOFF: + case R_390_GLOB_DAT: + case R_390_RELATIVE: + case R_390_IRELATIVE: + return read64be(buf); + case R_390_COPY: + case R_390_JMP_SLOT: + case R_390_NONE: + // These relocations are defined as not having an implicit addend. 
+ return 0; + default: + internalLinkerError(getErrorLocation(buf), + "cannot read addend for relocation " + toString(type)); + return 0; + } +} + +RelType SystemZ::getDynRel(RelType type) const { + if (type == R_390_64 || type == R_390_PC64) + return type; + return R_390_NONE; +} + +RelExpr SystemZ::adjustTlsExpr(RelType type, RelExpr expr) const { + if (expr == R_RELAX_TLS_GD_TO_IE) + return R_RELAX_TLS_GD_TO_IE_GOT_OFF; + return expr; +} + +int SystemZ::getTlsGdRelaxSkip(RelType type) const { + // A __tls_get_offset call instruction is marked with 2 relocations: + // + // R_390_TLS_GDCALL / R_390_TLS_LDCALL: marker relocation + // R_390_PLT32DBL: __tls_get_offset + // + // After the relaxation we no longer call __tls_get_offset and should skip + // both relocations to not create a false dependence on __tls_get_offset + // being defined. + // + // Note that this mechanism only works correctly if the R_390_TLS_[GL]DCALL + // is seen immediately *before* the R_390_PLT32DBL. Unfortunately, current + // compilers on the platform will typically generate the inverse sequence. + // To fix this, we sort relocations by offset in RelocationScanner::scan; + // this ensures the correct sequence as the R_390_TLS_[GL]DCALL applies to + // the first byte of the brasl instruction, while the R_390_PLT32DBL applies + // to its third byte (the relative displacement). 
+ + if (type == R_390_TLS_GDCALL || type == R_390_TLS_LDCALL) + return 2; + return 1; +} + +void SystemZ::relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { + // The general-dynamic code sequence for a global `x`: + // + // Instruction Relocation Symbol + // ear %rX,%a0 + // sllg %rX,%rX,32 + // ear %rX,%a1 + // larl %r12,_GLOBAL_OFFSET_TABLE_ R_390_GOTPCDBL _GLOBAL_OFFSET_TABLE_ + // lgrl %r2,.LC0 R_390_PC32DBL .LC0 + // brasl %r14,__tls_get_offset@plt R_390_TLS_GDCALL x + // :tls_gdcall:x R_390_PLT32DBL __tls_get_offset + // la %r2,0(%r2,%rX) + // + // .LC0: + // .quad x@TLSGD R_390_TLS_GD64 x + // + // Relaxing to initial-exec entails: + // 1) Replacing the call by a load from the GOT. + // 2) Replacing the relocation on the constant LC0 by R_390_TLS_GOTIE64. + + switch (rel.type) { + case R_390_TLS_GDCALL: + // brasl %r14,__tls_get_offset@plt -> lg %r2,0(%r2,%r12) + write16be(loc, 0xe322); + write32be(loc + 2, 0xc0000004); + break; + case R_390_TLS_GD64: + relocateNoSym(loc, R_390_TLS_GOTIE64, val); + break; + default: + llvm_unreachable("unsupported relocation for TLS GD to IE relaxation"); + } +} + +void SystemZ::relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { + // The general-dynamic code sequence for a global `x`: + // + // Instruction Relocation Symbol + // ear %rX,%a0 + // sllg %rX,%rX,32 + // ear %rX,%a1 + // larl %r12,_GLOBAL_OFFSET_TABLE_ R_390_GOTPCDBL _GLOBAL_OFFSET_TABLE_ + // lgrl %r2,.LC0 R_390_PC32DBL .LC0 + // brasl %r14,__tls_get_offset@plt R_390_TLS_GDCALL x + // :tls_gdcall:x R_390_PLT32DBL __tls_get_offset + // la %r2,0(%r2,%rX) + // + // .LC0: + // .quad x@tlsgd R_390_TLS_GD64 x + // + // Relaxing to local-exec entails: + // 1) Replacing the call by a nop. + // 2) Replacing the relocation on the constant LC0 by R_390_TLS_LE64. + + switch (rel.type) { + case R_390_TLS_GDCALL: + // brasl %r14,__tls_get_offset@plt -> brcl 0,. 
+ write16be(loc, 0xc004); + write32be(loc + 2, 0x00000000); + break; + case R_390_TLS_GD64: + relocateNoSym(loc, R_390_TLS_LE64, val); + break; + default: + llvm_unreachable("unsupported relocation for TLS GD to LE relaxation"); + } +} + +void SystemZ::relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { + // The local-dynamic code sequence for a global `x`: + // + // Instruction Relocation Symbol + // ear %rX,%a0 + // sllg %rX,%rX,32 + // ear %rX,%a1 + // larl %r12,_GLOBAL_OFFSET_TABLE_ R_390_GOTPCDBL _GLOBAL_OFFSET_TABLE_ + // lgrl %r2,.LC0 R_390_PC32DBL .LC0 + // brasl %r14,__tls_get_offset@plt R_390_TLS_LDCALL + // :tls_ldcall: R_390_PLT32DBL __tls_get_offset + // la %r2,0(%r2,%rX) + // lgrl %rY,.LC1 R_390_PC32DBL .LC1 + // la %r2,0(%r2,%rY) + // + // .LC0: + // .quad @tlsldm R_390_TLS_LDM64 + // .LC1: + // .quad x@dtpoff R_390_TLS_LDO64 x + // + // Relaxing to local-exec entails: + // 1) Replacing the call by a nop. + // 2) Replacing the constant LC0 by 0 (i.e. ignoring the relocation). + // 3) Replacing the relocation on the constant LC1 by R_390_TLS_LE64. + + switch (rel.type) { + case R_390_TLS_LDCALL: + // brasl %r14,__tls_get_offset@plt -> brcl 0,. + write16be(loc, 0xc004); + write32be(loc + 2, 0x00000000); + break; + case R_390_TLS_LDM64: + break; + case R_390_TLS_LDO64: + relocateNoSym(loc, R_390_TLS_LE64, val); + break; + default: + llvm_unreachable("unsupported relocation for TLS LD to LE relaxation"); + } +} + +RelExpr SystemZ::adjustGotPcExpr(RelType type, int64_t addend, + const uint8_t *loc) const { + // Only R_390_GOTENT with addend 2 can be relaxed. + if (!config->relax || addend != 2 || type != R_390_GOTENT) + return R_GOT_PC; + const uint16_t op = read16be(loc - 2); + + // lgrl rx,sym@GOTENT -> larl rx, sym + // This relaxation is legal if "sym" binds locally (which was already + // verified by our caller) and is in-range and properly aligned for a + // LARL instruction. 
We cannot verify the latter constraint here, so + // we assume it is true and revert the decision later on in relaxOnce + // if necessary. + if ((op & 0xff0f) == 0xc408) + return R_RELAX_GOT_PC; + + return R_GOT_PC; +} + +bool SystemZ::relaxOnce(int pass) const { + // If we decided in adjustGotPcExpr to relax a R_390_GOTENT, + // we need to validate the target symbol is in-range and aligned. + SmallVector storage; + bool changed = false; + for (OutputSection *osec : outputSections) { + if (!(osec->flags & SHF_EXECINSTR)) + continue; + for (InputSection *sec : getInputSections(*osec, storage)) { + for (Relocation &rel : sec->relocs()) { + if (rel.expr != R_RELAX_GOT_PC) + continue; + + uint64_t v = sec->getRelocTargetVA( + sec->file, rel.type, rel.addend, + sec->getOutputSection()->addr + rel.offset, *rel.sym, rel.expr); + if (isInt<33>(v) && !(v & 1)) + continue; + if (rel.sym->auxIdx == 0) { + rel.sym->allocateAux(); + addGotEntry(*rel.sym); + changed = true; + } + rel.expr = R_GOT_PC; + } + } + } + return changed; +} + +void SystemZ::relaxGot(uint8_t *loc, const Relocation &rel, + uint64_t val) const { + assert(isInt<33>(val) && + "R_390_GOTENT should not have been relaxed if it overflows"); + assert(!(val & 1) && + "R_390_GOTENT should not have been relaxed if it is misaligned"); + const uint16_t op = read16be(loc - 2); + + // lgrl rx,sym@GOTENT -> larl rx, sym + if ((op & 0xff0f) == 0xc408) { + write16be(loc - 2, 0xc000 | (op & 0x00f0)); + write32be(loc, val >> 1); + } +} + +void SystemZ::relocate(uint8_t *loc, const Relocation &rel, + uint64_t val) const { + switch (rel.expr) { + case R_RELAX_GOT_PC: + return relaxGot(loc, rel, val); + case R_RELAX_TLS_GD_TO_IE_GOT_OFF: + return relaxTlsGdToIe(loc, rel, val); + case R_RELAX_TLS_GD_TO_LE: + return relaxTlsGdToLe(loc, rel, val); + case R_RELAX_TLS_LD_TO_LE: + return relaxTlsLdToLe(loc, rel, val); + default: + break; + } + switch (rel.type) { + case R_390_8: + checkIntUInt(loc, val, 8, rel); + *loc = val; + 
break; + case R_390_12: + case R_390_GOT12: + case R_390_GOTPLT12: + case R_390_TLS_GOTIE12: + checkUInt(loc, val, 12, rel); + write16be(loc, (read16be(loc) & 0xF000) | val); + break; + case R_390_PC12DBL: + case R_390_PLT12DBL: + checkInt(loc, val, 13, rel); + checkAlignment(loc, val, 2, rel); + write16be(loc, (read16be(loc) & 0xF000) | ((val >> 1) & 0x0FFF)); + break; + case R_390_16: + case R_390_GOT16: + case R_390_GOTPLT16: + case R_390_GOTOFF16: + case R_390_PLTOFF16: + checkIntUInt(loc, val, 16, rel); + write16be(loc, val); + break; + case R_390_PC16: + checkInt(loc, val, 16, rel); + write16be(loc, val); + break; + case R_390_PC16DBL: + case R_390_PLT16DBL: + checkInt(loc, val, 17, rel); + checkAlignment(loc, val, 2, rel); + write16be(loc, val >> 1); + break; + case R_390_20: + case R_390_GOT20: + case R_390_GOTPLT20: + case R_390_TLS_GOTIE20: + checkInt(loc, val, 20, rel); + write32be(loc, (read32be(loc) & 0xF00000FF) | ((val & 0xFFF) << 16) | + ((val & 0xFF000) >> 4)); + break; + case R_390_PC24DBL: + case R_390_PLT24DBL: + checkInt(loc, val, 25, rel); + checkAlignment(loc, val, 2, rel); + loc[0] = val >> 17; + loc[1] = val >> 9; + loc[2] = val >> 1; + break; + case R_390_32: + case R_390_GOT32: + case R_390_GOTPLT32: + case R_390_GOTOFF: + case R_390_PLTOFF32: + case R_390_TLS_IE32: + case R_390_TLS_GOTIE32: + case R_390_TLS_GD32: + case R_390_TLS_LDM32: + case R_390_TLS_LDO32: + case R_390_TLS_LE32: + checkIntUInt(loc, val, 32, rel); + write32be(loc, val); + break; + case R_390_PC32: + case R_390_PLT32: + checkInt(loc, val, 32, rel); + write32be(loc, val); + break; + case R_390_PC32DBL: + case R_390_PLT32DBL: + case R_390_GOTPCDBL: + case R_390_GOTENT: + case R_390_GOTPLTENT: + case R_390_TLS_IEENT: + checkInt(loc, val, 33, rel); + checkAlignment(loc, val, 2, rel); + write32be(loc, val >> 1); + break; + case R_390_64: + case R_390_PC64: + case R_390_PLT64: + case R_390_GOT64: + case R_390_GOTPLT64: + case R_390_GOTOFF64: + case R_390_PLTOFF64: + case 
R_390_GOTPC: + case R_390_TLS_IE64: + case R_390_TLS_GOTIE64: + case R_390_TLS_GD64: + case R_390_TLS_LDM64: + case R_390_TLS_LDO64: + case R_390_TLS_LE64: + case R_390_TLS_DTPMOD: + case R_390_TLS_DTPOFF: + case R_390_TLS_TPOFF: + write64be(loc, val); + break; + case R_390_TLS_LOAD: + case R_390_TLS_GDCALL: + case R_390_TLS_LDCALL: + break; + default: + llvm_unreachable("unknown relocation"); + } +} + +TargetInfo *elf::getSystemZTargetInfo() { + static SystemZ t; + return &t; +} diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp index de459013595fed..a85bf3aa0c09d1 100644 --- a/lld/ELF/Arch/X86_64.cpp +++ b/lld/ELF/Arch/X86_64.cpp @@ -328,9 +328,10 @@ bool X86_64::relaxOnce(int pass) const { if (rel.expr != R_RELAX_GOT_PC) continue; - uint64_t v = sec->getRelocTargetVA( - sec->file, rel.type, rel.addend, - sec->getOutputSection()->addr + rel.offset, *rel.sym, rel.expr); + uint64_t v = sec->getRelocTargetVA(sec->file, rel.type, rel.addend, + sec->getOutputSection()->addr + + sec->outSecOff + rel.offset, + *rel.sym, rel.expr); if (isInt<32>(v)) continue; if (rel.sym->auxIdx == 0) { diff --git a/lld/ELF/CMakeLists.txt b/lld/ELF/CMakeLists.txt index 475f7dea1dd7e9..83d816ddb0601e 100644 --- a/lld/ELF/CMakeLists.txt +++ b/lld/ELF/CMakeLists.txt @@ -33,6 +33,7 @@ add_lld_library(lldELF Arch/PPC64.cpp Arch/RISCV.cpp Arch/SPARCV9.cpp + Arch/SystemZ.cpp Arch/X86.cpp Arch/X86_64.cpp ARMErrataFix.cpp diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index f4b7d1c9d5b973..8b2c32b1534821 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -200,6 +200,7 @@ static std::tuple parseEmulation(StringRef emul) { .Case("msp430elf", {ELF32LEKind, EM_MSP430}) .Case("elf64_amdgpu", {ELF64LEKind, EM_AMDGPU}) .Case("elf64loongarch", {ELF64LEKind, EM_LOONGARCH}) + .Case("elf64_s390", {ELF64BEKind, EM_S390}) .Default({ELFNoneKind, EM_NONE}); if (ret.first == ELFNoneKind) @@ -1137,7 +1138,7 @@ static SmallVector getSymbolOrderingFile(MemoryBufferRef mb) { static bool 
getIsRela(opt::InputArgList &args) { // The psABI specifies the default relocation entry format. bool rela = is_contained({EM_AARCH64, EM_AMDGPU, EM_HEXAGON, EM_LOONGARCH, - EM_PPC, EM_PPC64, EM_RISCV, EM_X86_64}, + EM_PPC, EM_PPC64, EM_RISCV, EM_S390, EM_X86_64}, config->emachine); // If -z rel or -z rela is specified, use the last option. for (auto *arg : args.filtered(OPT_z)) { diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index 75e5ee1d0da4f5..6c7ef27cbd4942 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -1614,6 +1614,8 @@ static uint16_t getBitcodeMachineKind(StringRef path, const Triple &t) { return EM_RISCV; case Triple::sparcv9: return EM_SPARCV9; + case Triple::systemz: + return EM_S390; case Triple::x86: return t.isOSIAMCU() ? EM_IAMCU : EM_386; case Triple::x86_64: @@ -1788,7 +1790,12 @@ void BinaryFile::parse() { } InputFile *elf::createInternalFile(StringRef name) { - return make(InputFile::InternalKind, MemoryBufferRef("", name)); + auto *file = + make(InputFile::InternalKind, MemoryBufferRef("", name)); + // References from an internal file do not lead to --warn-backrefs + // diagnostics. + file->groupId = 0; + return file; } ELFFileBase *elf::createObjFile(MemoryBufferRef mb, StringRef archiveName, diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index c728dd6c6306aa..e033a715b59214 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -354,9 +354,10 @@ InputSectionBase *InputSection::getRelocatedSection() const { template void InputSection::copyRelocations(uint8_t *buf) { - if (config->relax && !config->relocatable && config->emachine == EM_RISCV) { - // On RISC-V, relaxation might change relocations: copy from internal ones - // that are updated by relaxation. 
+ if (config->relax && !config->relocatable && + (config->emachine == EM_RISCV || config->emachine == EM_LOONGARCH)) { + // On LoongArch and RISC-V, relaxation might change relocations: copy + // from internal ones that are updated by relaxation. InputSectionBase *sec = getRelocatedSection(); copyRelocations(buf, llvm::make_range(sec->relocations.begin(), sec->relocations.end())); @@ -654,6 +655,7 @@ static int64_t getTlsTpOffset(const Symbol &s) { // Variant 2. case EM_HEXAGON: + case EM_S390: case EM_SPARCV9: case EM_386: case EM_X86_64: @@ -716,6 +718,10 @@ uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type, case R_GOT_PC: case R_RELAX_TLS_GD_TO_IE: return sym.getGotVA() + a - p; + case R_GOTPLT_GOTREL: + return sym.getGotPltVA() + a - in.got->getVA(); + case R_GOTPLT_PC: + return sym.getGotPltVA() + a - p; case R_LOONGARCH_GOT_PAGE_PC: if (sym.hasFlag(NEEDS_TLSGD)) return getLoongArchPageDelta(in.got->getGlobalDynAddr(sym) + a, p, type); @@ -807,6 +813,8 @@ uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type, return getLoongArchPageDelta(sym.getPltVA() + a, p, type); case R_PLT_GOTPLT: return sym.getPltVA() + a - in.gotPlt->getVA(); + case R_PLT_GOTREL: + return sym.getPltVA() + a - in.got->getVA(); case R_PPC32_PLTREL: // R_PPC_PLTREL24 uses the addend (usually 0 or 0x8000) to indicate r30 // stores _GLOBAL_OFFSET_TABLE_ or .got2+0x8000. The addend is ignored for @@ -961,12 +969,11 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef rels) { // vector. The computed value is st_value plus a non-negative offset. // Negative values are invalid, so -1 can be used as the tombstone value. // - // If the referenced symbol is discarded (made Undefined), or the - // section defining the referenced symbol is garbage collected, - // sym.getOutputSection() is nullptr. `ds->folded` catches the ICF folded - // case. 
However, resolving a relocation in .debug_line to -1 would stop - // debugger users from setting breakpoints on the folded-in function, so - // exclude .debug_line. + // If the referenced symbol is relative to a discarded section (due to + // --gc-sections, COMDAT, etc), it has been converted to an Undefined. + // `ds->folded` catches the ICF folded case. However, resolving a + // relocation in .debug_line to -1 would stop debugger users from setting + // breakpoints on the folded-in function, so exclude .debug_line. // // For pre-DWARF-v5 .debug_loc and .debug_ranges, -1 is a reserved value // (base address selection entry), use 1 (which is used by GNU ld for @@ -974,7 +981,7 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef rels) { // // TODO To reduce disruption, we use 0 instead of -1 as the tombstone // value. Enable -1 in a future release. - if (!sym.getOutputSection() || (ds && ds->folded && !isDebugLine)) { + if (!ds || (ds->folded && !isDebugLine)) { // If -z dead-reloc-in-nonalloc= is specified, respect it. uint64_t value = SignExtend64(*tombstone); // For a 32-bit local TU reference in .debug_names, X86_64::relocate diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h index dda4242d8be1c1..243b28d90bb4c1 100644 --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -102,7 +102,23 @@ class SectionBase { link(link), info(info) {} }; -struct RISCVRelaxAux; +struct SymbolAnchor { + uint64_t offset; + Defined *d; + bool end; // true for the anchor of st_value+st_size +}; + +struct RelaxAux { + // This records symbol start and end offsets which will be adjusted according + // to the nearest relocDeltas element. + SmallVector anchors; + // For relocations[i], the actual offset is + // r_offset - (i ? relocDeltas[i-1] : 0). + std::unique_ptr relocDeltas; + // For relocations[i], the actual type is relocTypes[i]. + std::unique_ptr relocTypes; + SmallVector writes; +}; // This corresponds to a section of an input file.
class InputSectionBase : public SectionBase { @@ -226,9 +242,9 @@ class InputSectionBase : public SectionBase { // basic blocks. JumpInstrMod *jumpInstrMod = nullptr; - // Auxiliary information for RISC-V linker relaxation. RISC-V does not use - // jumpInstrMod. - RISCVRelaxAux *relaxAux; + // Auxiliary information for RISC-V and LoongArch linker relaxation. + // They do not use jumpInstrMod. + RelaxAux *relaxAux; // The compressed content size when `compressed` is true. size_t compressedSize; diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index b6a317bc3b6d69..619fbaf5dc5452 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -203,8 +203,9 @@ static bool isAbsoluteValue(const Symbol &sym) { // Returns true if Expr refers a PLT entry. static bool needsPlt(RelExpr expr) { - return oneof(expr); + return oneof(expr); } bool lld::elf::needsGot(RelExpr expr) { @@ -233,6 +234,8 @@ static RelExpr toPlt(RelExpr expr) { return R_PLT_PC; case R_ABS: return R_PLT; + case R_GOTREL: + return R_PLT_GOTREL; default: return expr; } @@ -253,6 +256,8 @@ static RelExpr fromPlt(RelExpr expr) { return R_ABS; case R_PLT_GOTPLT: return R_GOTPLTREL; + case R_PLT_GOTREL: + return R_GOTREL; default: return expr; } @@ -935,7 +940,7 @@ void elf::addGotEntry(Symbol &sym) { static void addTpOffsetGotEntry(Symbol &sym) { in.got->addEntry(sym); uint64_t off = sym.getGotOffset(); - if (!sym.isPreemptible && !config->isPic) { + if (!sym.isPreemptible && !config->shared) { in.got->addConstant({R_TPREL, target->symbolicRel, off, 0, &sym}); return; } @@ -979,10 +984,10 @@ bool RelocationScanner::isStaticLinkTimeConstant(RelExpr e, RelType type, if (oneof( - e)) + R_PLT_PC, R_PLT_GOTREL, R_PLT_GOTPLT, R_GOTPLT_GOTREL, R_GOTPLT_PC, + R_PPC32_PLTREL, R_PPC64_CALL_PLT, R_PPC64_RELAX_TOC, R_RISCV_ADD, + R_AARCH64_GOT_PAGE, R_LOONGARCH_PLT_PAGE_PC, R_LOONGARCH_GOT, + R_LOONGARCH_GOT_PAGE_PC>(e)) return true; // These never do, except if the entire file is position dependent 
or if @@ -1274,29 +1279,34 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym, if (config->emachine == EM_MIPS) return handleMipsTlsRelocation(type, sym, c, offset, addend, expr); + bool isRISCV = config->emachine == EM_RISCV; if (oneof(expr) && config->shared) { + // R_RISCV_TLSDESC_{LOAD_LO12,ADD_LO12_I,CALL} reference a label. Do not + // set NEEDS_TLSDESC on the label. if (expr != R_TLSDESC_CALL) { - sym.setFlags(NEEDS_TLSDESC); + if (!isRISCV || type == R_RISCV_TLSDESC_HI20) + sym.setFlags(NEEDS_TLSDESC); c.addReloc({expr, type, offset, addend, &sym}); } return 1; } // ARM, Hexagon, LoongArch and RISC-V do not support GD/LD to IE/LE - // relaxation. + // optimizations. + // RISC-V supports TLSDESC to IE/LE optimizations. // For PPC64, if the file has missing R_PPC64_TLSGD/R_PPC64_TLSLD, disable - // relaxation as well. - bool toExecRelax = !config->shared && config->emachine != EM_ARM && - config->emachine != EM_HEXAGON && - config->emachine != EM_LOONGARCH && - config->emachine != EM_RISCV && - !c.file->ppc64DisableTLSRelax; + // optimization as well. + bool execOptimize = + !config->shared && config->emachine != EM_ARM && + config->emachine != EM_HEXAGON && config->emachine != EM_LOONGARCH && + !(isRISCV && expr != R_TLSDESC_PC && expr != R_TLSDESC_CALL) && + !c.file->ppc64DisableTLSRelax; // If we are producing an executable and the symbol is non-preemptable, it - // must be defined and the code sequence can be relaxed to use Local-Exec. + // must be defined and the code sequence can be optimized to use Local-Exec. // // ARM and RISC-V do not support any relaxations for TLS relocations, however, // we can omit the DTPMOD dynamic relocations and resolve them at link time @@ -1309,8 +1319,8 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym, // module index, with a special value of 0 for the current module. GOT[e1] is // unused. There only needs to be one module index entry. 
if (oneof(expr)) { - // Local-Dynamic relocs can be relaxed to Local-Exec. - if (toExecRelax) { + // Local-Dynamic relocs can be optimized to Local-Exec. + if (execOptimize) { c.addReloc({target->adjustTlsExpr(type, R_RELAX_TLS_LD_TO_LE), type, offset, addend, &sym}); return target->getTlsGdRelaxSkip(type); @@ -1322,16 +1332,17 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym, return 1; } - // Local-Dynamic relocs can be relaxed to Local-Exec. + // Local-Dynamic relocs can be optimized to Local-Exec. if (expr == R_DTPREL) { - if (toExecRelax) + if (execOptimize) expr = target->adjustTlsExpr(type, R_RELAX_TLS_LD_TO_LE); c.addReloc({expr, type, offset, addend, &sym}); return 1; } // Local-Dynamic sequence where offset of tls variable relative to dynamic - // thread pointer is stored in the got. This cannot be relaxed to Local-Exec. + // thread pointer is stored in the got. This cannot be optimized to + // Local-Exec. if (expr == R_TLSLD_GOT_OFF) { sym.setFlags(NEEDS_GOT_DTPREL); c.addReloc({expr, type, offset, addend, &sym}); @@ -1341,14 +1352,18 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym, if (oneof(expr)) { - if (!toExecRelax) { + if (!execOptimize) { sym.setFlags(NEEDS_TLSGD); c.addReloc({expr, type, offset, addend, &sym}); return 1; } - // Global-Dynamic relocs can be relaxed to Initial-Exec or Local-Exec + // Global-Dynamic/TLSDESC can be optimized to Initial-Exec or Local-Exec // depending on the symbol being locally defined or not. + // + // R_RISCV_TLSDESC_{LOAD_LO12,ADD_LO12_I,CALL} reference a non-preemptible + // label, so the LE optimization will be categorized as + // R_RELAX_TLS_GD_TO_LE. We fix the categorization in RISCV::relocateAlloc. 
if (sym.isPreemptible) { sym.setFlags(NEEDS_TLSGD_TO_IE); c.addReloc({target->adjustTlsExpr(type, R_RELAX_TLS_GD_TO_IE), type, @@ -1363,9 +1378,9 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym, if (oneof(expr)) { ctx.hasTlsIe.store(true, std::memory_order_relaxed); - // Initial-Exec relocs can be relaxed to Local-Exec if the symbol is locally - // defined. - if (toExecRelax && isLocalInExecutable) { + // Initial-Exec relocs can be optimized to Local-Exec if the symbol is + // locally defined. This is not supported on SystemZ. + if (execOptimize && isLocalInExecutable && config->emachine != EM_S390) { c.addReloc({R_RELAX_TLS_IE_TO_LE, type, offset, addend, &sym}); } else if (expr != R_TLSIE_HINT) { sym.setFlags(NEEDS_TLSIE); @@ -1463,7 +1478,7 @@ template void RelocationScanner::scanOne(RelTy *&i) { in.got->hasGotOffRel.store(true, std::memory_order_relaxed); } - // Process TLS relocations, including relaxing TLS relocations. Note that + // Process TLS relocations, including TLS optimizations. Note that // R_TPREL and R_TPREL_NEG relocations are resolved in processAux. if (sym.isTls()) { if (unsigned processed = @@ -1524,8 +1539,10 @@ void RelocationScanner::scan(ArrayRef rels) { // For EhInputSection, OffsetGetter expects the relocations to be sorted by // r_offset. In rare cases (.eh_frame pieces are reordered by a linker // script), the relocations may be unordered. + // On SystemZ, all sections need to be sorted by r_offset, to allow TLS + // relaxation to be handled correctly - see SystemZ::getTlsGdRelaxSkip. 
SmallVector storage; - if (isa(sec)) + if (isa(sec) || config->emachine == EM_S390) rels = sortRels(rels, storage); end = static_cast(rels.end()); diff --git a/lld/ELF/Relocations.h b/lld/ELF/Relocations.h index cfb9092149f3e0..7eb8a811e6934f 100644 --- a/lld/ELF/Relocations.h +++ b/lld/ELF/Relocations.h @@ -40,11 +40,14 @@ enum RelExpr { R_GOTPLT, R_GOTPLTREL, R_GOTREL, + R_GOTPLT_GOTREL, + R_GOTPLT_PC, R_NONE, R_PC, R_PLT, R_PLT_PC, R_PLT_GOTPLT, + R_PLT_GOTREL, R_RELAX_HINT, R_RELAX_GOT_PC, R_RELAX_GOT_PC_NOPIC, diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp index dd69916d6b05e8..f0ede1f43bbdb3 100644 --- a/lld/ELF/ScriptParser.cpp +++ b/lld/ELF/ScriptParser.cpp @@ -445,6 +445,7 @@ static std::pair parseBfdName(StringRef s) { .Case("elf32-msp430", {ELF32LEKind, EM_MSP430}) .Case("elf32-loongarch", {ELF32LEKind, EM_LOONGARCH}) .Case("elf64-loongarch", {ELF64LEKind, EM_LOONGARCH}) + .Case("elf64-s390", {ELF64BEKind, EM_S390}) .Default({ELFNoneKind, EM_NONE}); } diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 4b413163314b2e..bada394aa30d7d 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -1419,6 +1419,9 @@ DynamicSection::computeContents() { case EM_MIPS: addInSec(DT_MIPS_PLTGOT, *in.gotPlt); break; + case EM_S390: + addInSec(DT_PLTGOT, *in.got); + break; case EM_SPARCV9: addInSec(DT_PLTGOT, *in.plt); break; diff --git a/lld/ELF/Target.cpp b/lld/ELF/Target.cpp index 671d22cc66a0e9..b7922425a34e43 100644 --- a/lld/ELF/Target.cpp +++ b/lld/ELF/Target.cpp @@ -87,6 +87,8 @@ TargetInfo *elf::getTarget() { return getRISCVTargetInfo(); case EM_SPARCV9: return getSPARCV9TargetInfo(); + case EM_S390: + return getSystemZTargetInfo(); case EM_X86_64: return getX86_64TargetInfo(); } diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h index ab6b6b9c013ba3..0cefa318135662 100644 --- a/lld/ELF/Target.h +++ b/lld/ELF/Target.h @@ -95,6 +95,8 @@ class TargetInfo { // Do a linker relaxation pass and 
return true if we changed something. virtual bool relaxOnce(int pass) const { return false; } + // Do finalize relaxation after collecting relaxation infos. + virtual void finalizeRelax(int passes) const {} virtual void applyJumpInstrMod(uint8_t *loc, JumpModType type, JumpModType val) const {} @@ -186,6 +188,7 @@ TargetInfo *getPPC64TargetInfo(); TargetInfo *getPPCTargetInfo(); TargetInfo *getRISCVTargetInfo(); TargetInfo *getSPARCV9TargetInfo(); +TargetInfo *getSystemZTargetInfo(); TargetInfo *getX86TargetInfo(); TargetInfo *getX86_64TargetInfo(); template TargetInfo *getMipsTargetInfo(); @@ -236,6 +239,7 @@ void addArmSyntheticSectionMappingSymbol(Defined *); void sortArmMappingSymbols(); void convertArmInstructionstoBE8(InputSection *sec, uint8_t *buf); void createTaggedSymbols(const SmallVector &files); +void initSymbolAnchors(); LLVM_LIBRARY_VISIBILITY extern const TargetInfo *target; TargetInfo *getTarget(); diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 6f66f3615fa4a2..8a08b0fcc90dbc 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -261,6 +261,9 @@ static void demoteDefined(Defined &sym, DenseMap &map) { Undefined(sym.file, sym.getName(), binding, sym.stOther, sym.type, /*discardedSecIdx=*/map.lookup(sym.section)) .overwrite(sym); + // Eliminate from the symbol table, otherwise we would leave an undefined + // symbol if the symbol is unreferenced in the absence of GC. + sym.isUsedInRegularObj = false; } // If all references to a DSO happen to be weak, the DSO is not added to @@ -1518,12 +1521,12 @@ template void Writer::sortSections() { if (auto *osd = dyn_cast(cmd)) osd->osec.sortRank = getSectionRank(osd->osec); if (!script->hasSectionsCommand) { - // We know that all the OutputSections are contiguous in this case. 
- auto isSection = [](SectionCommand *cmd) { return isa(cmd); }; - std::stable_sort( - llvm::find_if(script->sectionCommands, isSection), - llvm::find_if(llvm::reverse(script->sectionCommands), isSection).base(), - compareSections); + // OutputDescs are mostly contiguous, but may be interleaved with + // SymbolAssignments in the presence of INSERT commands. + auto mid = std::stable_partition( + script->sectionCommands.begin(), script->sectionCommands.end(), + [](SectionCommand *cmd) { return isa(cmd); }); + std::stable_sort(script->sectionCommands.begin(), mid, compareSections); } // Process INSERT commands and update output section attributes. From this @@ -1752,8 +1755,8 @@ template void Writer::finalizeAddressDependentContent() { } } } - if (!config->relocatable && config->emachine == EM_RISCV) - riscvFinalizeRelax(pass); + if (!config->relocatable) + target->finalizeRelax(pass); if (config->relocatable) for (OutputSection *sec : outputSections) diff --git a/lld/MinGW/Driver.cpp b/lld/MinGW/Driver.cpp index 4752d92e3b1d71..7b16764dd2c7ce 100644 --- a/lld/MinGW/Driver.cpp +++ b/lld/MinGW/Driver.cpp @@ -448,6 +448,10 @@ bool link(ArrayRef argsArr, llvm::raw_ostream &stdoutOS, add("-lto-cs-profile-generate"); if (auto *arg = args.getLastArg(OPT_lto_cs_profile_file)) add("-lto-cs-profile-file:" + StringRef(arg->getValue())); + if (args.hasArg(OPT_plugin_opt_emit_llvm)) + add("-lldemit:llvm"); + if (args.hasArg(OPT_lto_emit_asm)) + add("-lldemit:asm"); if (auto *a = args.getLastArg(OPT_thinlto_cache_dir)) add("-lldltocache:" + StringRef(a->getValue())); diff --git a/lld/MinGW/Options.td b/lld/MinGW/Options.td index 02f00f27406c08..9a0a96aac7f1c6 100644 --- a/lld/MinGW/Options.td +++ b/lld/MinGW/Options.td @@ -158,6 +158,8 @@ def lto_cs_profile_generate: FF<"lto-cs-profile-generate">, HelpText<"Perform context sensitive PGO instrumentation">; def lto_cs_profile_file: JJ<"lto-cs-profile-file=">, HelpText<"Context sensitive profile file path">; +def lto_emit_asm: 
FF<"lto-emit-asm">, + HelpText<"Emit assembly code">; def thinlto_cache_dir: JJ<"thinlto-cache-dir=">, HelpText<"Path to ThinLTO cached object file directory">; @@ -181,6 +183,9 @@ def: J<"plugin-opt=cs-profile-path=">, Alias, HelpText<"Alias for --lto-cs-profile-file">; def plugin_opt_dwo_dir_eq: J<"plugin-opt=dwo_dir=">, HelpText<"Directory to store .dwo files when LTO and debug fission are used">; +def plugin_opt_emit_asm: F<"plugin-opt=emit-asm">, + Alias, HelpText<"Alias for --lto-emit-asm">; +def plugin_opt_emit_llvm: F<"plugin-opt=emit-llvm">; def: J<"plugin-opt=jobs=">, Alias, HelpText<"Alias for --thinlto-jobs=">; def plugin_opt_mcpu_eq: J<"plugin-opt=mcpu=">; diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst index 01669543cd50ca..6ada711a20a6da 100644 --- a/lld/docs/ReleaseNotes.rst +++ b/lld/docs/ReleaseNotes.rst @@ -29,8 +29,50 @@ ELF Improvements * ``--fat-lto-objects`` option is added to support LLVM FatLTO. Without ``--fat-lto-objects``, LLD will link LLVM FatLTO objects using the relocatable object file. (`D146778 `_) +* ``-Bsymbolic-non-weak`` is added to directly bind non-weak definitions. + (`D158322 `_) +* ``--lto-validate-all-vtables-have-type-infos``, which complements + ``--lto-whole-program-visibility``, is added to disable unsafe whole-program + devirtualization. ``--lto-known-safe-vtables=`` can be used + to mark known-safe vtable symbols. + (`D155659 `_) +* ``--save-temps --lto-emit-asm`` now derives ELF/asm file names from bitcode file names. + ``ld.lld --save-temps a.o d/b.o -o out`` will create ELF relocatable files + ``out.lto.a.o``/``d/out.lto.b.o`` instead of ``out1.lto.o``/``out2.lto.o``. + (`#78835 `_) +* ``--no-allow-shlib-undefined`` now reports errors for DSO referencing + non-exported definitions. + (`#70769 `_) * common-page-size can now be larger than the system page-size. 
(`#57618 `_) +* When call graph profile information is available due to instrumentation or + sample PGO, input sections are now sorted using the new ``cdsort`` algorithm, + better than the previous ``hfsort`` algorithm. + (`D152840 `_) +* Symbol assignments like ``a = DEFINED(a) ? a : 0;`` are now handled. + (`#65866 `_) +* ``OVERLAY`` now supports optional start address and LMA + (`#77272 `_) +* Relocations referencing a symbol defined in ``/DISCARD/`` section now lead to + an error. + (`#69295 `_) +* For AArch64 MTE, global variable descriptors have been implemented. + (`D152921 `_) +* ``R_AARCH64_GOTPCREL32`` is now supported. + (`#72584 `_) +* ``R_LARCH_PCREL20_S2``/``R_LARCH_ADD6``/``R_LARCH_CALL36`` and extreme code + model relocations are now supported. +* ``--emit-relocs`` is now supported for RISC-V linker relaxation. + (`D159082 `_) +* Call relaxation respects RVC when mixing +c and -c relocatable files. + (`#73977 `_) +* ``R_RISCV_GOT32_PCREL`` is now supported. + (`#72587 `_) +* ``R_RISCV_SET_ULEB128``/``R_RISCV_SUB_ULEB128`` relocations are now supported. + (`#72610 `_) + (`#77261 `_) +* RISC-V TLSDESC is now supported. + (`#79239 `_) Breaking changes ---------------- @@ -40,10 +82,77 @@ COFF Improvements * Added support for ``--time-trace`` and associated ``--time-trace-granularity``. This generates a .json profile trace of the linker execution. + (`#68236 `_) + +* The ``-dependentloadflag`` option was implemented. + (`#71537 `_) + +* LLD now prefers library paths specified with ``-libpath:`` over the implicitly + detected toolchain paths. + (`#78039 `_) + +* Added new options ``-lldemit:llvm`` and ``-lldemit:asm`` for getting + the output of LTO compilation as LLVM bitcode or assembly. + (`#66964 `_) + (`#67079 `_) + +* Added a new option ``-build-id`` for generating a ``.buildid`` section + when not generating a PDB. A new symbol ``__buildid`` is generated by + the linker, allowing code to reference the build ID of the binary. 
+ (`#71433 `_) + (`#74652 `_) + +* A new, LLD specific option, ``-lld-allow-duplicate-weak``, was added + for allowing duplicate weak symbols. + (`#68077 `_) + +* More correctly handle LTO of files that define ``__imp_`` prefixed dllimport + redirections. + (`#70777 `_) + (`#71376 `_) + (`#72989 `_) + +* Linking undefined references to weak symbols with LTO now works. + (`#70430 `_) + +* Use the ``SOURCE_DATE_EPOCH`` environment variable for the PE header and + debug directory timestamps, if neither the ``/Brepro`` nor ``/timestamp:`` + options have been specified. This makes the linker output reproducible by + setting this environment variable. + (`#81326 `_) + +* Lots of incremental work towards supporting linking ARM64EC binaries. MinGW Improvements ------------------ +* Added support for many LTO and ThinLTO options (most LTO options supported + by the ELF driver, that are implemented by the COFF backend as well, + should be supported now). + (`D158412 `_) + (`D158887 `_) + (`#77387 `_) + (`#81475 `_) + +* LLD no longer tries to autodetect and use library paths from MSVC/WinSDK + installations when run in MinGW mode; that mode of operation shouldn't + ever be needed in MinGW mode, and could be a source of unexpected + behaviours. + (`D144084 `_) + +* The ``--icf=safe`` option now works as expected; it was previously a no-op. + (`#70037 `_) + +* The strip flags ``-S`` and ``-s`` now can be used to strip out DWARF debug + info and symbol tables while emitting a PDB debug info file. + (`#75181 `_) + +* The option ``--dll`` is handled as an alias for the ``--shared`` option. + (`#68575 `_) + +* The option ``--sort-common`` is ignored now. + (`#66336 `_) + MachO Improvements ------------------ @@ -54,5 +163,10 @@ WebAssembly Improvements is read from object files within the archive. This matches the behaviour of the ELF linker. +SystemZ +------- + +* Add target support for SystemZ (s390x). 
+ Fixes ##### diff --git a/lld/test/COFF/autoimport-gc.s b/lld/test/COFF/autoimport-gc.s new file mode 100644 index 00000000000000..fef6c02eba82f9 --- /dev/null +++ b/lld/test/COFF/autoimport-gc.s @@ -0,0 +1,41 @@ +# REQUIRES: x86 +# RUN: split-file %s %t.dir + +# RUN: llvm-mc -triple=x86_64-windows-gnu %t.dir/lib.s -filetype=obj -o %t.dir/lib.obj +# RUN: lld-link -out:%t.dir/lib.dll -dll -entry:DllMainCRTStartup %t.dir/lib.obj -lldmingw -implib:%t.dir/lib.lib + +# RUN: llvm-mc -triple=x86_64-windows-gnu %t.dir/main.s -filetype=obj -o %t.dir/main.obj +# RUN: lld-link -lldmingw -out:%t.dir/main.exe -entry:main %t.dir/main.obj %t.dir/lib.lib -opt:ref -debug:dwarf + +#--- main.s + .global main + .section .text$main,"xr",one_only,main +main: + ret + + .global other + .section .text$other,"xr",one_only,other +other: + movq .refptr.variable(%rip), %rax + movl (%rax), %eax + ret + + .section .rdata$.refptr.variable,"dr",discard,.refptr.variable + .global .refptr.variable +.refptr.variable: + .quad variable + + .section .debug_info + .long 1 + .quad variable + .long 2 + +#--- lib.s + .global variable + .global DllMainCRTStartup + .text +DllMainCRTStartup: + ret + .data +variable: + .long 42 diff --git a/lld/test/COFF/def-export-cpp.s b/lld/test/COFF/def-export-cpp.s index e00b35b1c5b39b..370b8ddba4104b 100644 --- a/lld/test/COFF/def-export-cpp.s +++ b/lld/test/COFF/def-export-cpp.s @@ -10,6 +10,7 @@ # IMPLIB: File: foo.dll # IMPLIB: Name type: undecorate +# IMPLIB-NEXT: Export name: GetPathOnDisk # IMPLIB-NEXT: Symbol: __imp_?GetPathOnDisk@@YA_NPEA_W@Z # IMPLIB-NEXT: Symbol: ?GetPathOnDisk@@YA_NPEA_W@Z diff --git a/lld/test/COFF/def-export-stdcall.s b/lld/test/COFF/def-export-stdcall.s index f015e205c74a33..7e4e04c77cbe7a 100644 --- a/lld/test/COFF/def-export-stdcall.s +++ b/lld/test/COFF/def-export-stdcall.s @@ -6,15 +6,19 @@ # RUN: llvm-readobj --coff-exports %t.dll | FileCheck -check-prefix UNDECORATED-EXPORTS %s # UNDECORATED-IMPLIB: Name type: noprefix +# 
UNDECORATED-IMPLIB-NEXT: Export name: _underscored # UNDECORATED-IMPLIB-NEXT: __imp___underscored # UNDECORATED-IMPLIB-NEXT: __underscored # UNDECORATED-IMPLIB: Name type: undecorate +# UNDECORATED-IMPLIB-NEXT: Export name: fastcall # UNDECORATED-IMPLIB-NEXT: __imp_@fastcall@8 # UNDECORATED-IMPLIB-NEXT: fastcall@8 # UNDECORATED-IMPLIB: Name type: undecorate +# UNDECORATED-IMPLIB-NEXT: Export name: stdcall # UNDECORATED-IMPLIB-NEXT: __imp__stdcall@8 # UNDECORATED-IMPLIB-NEXT: _stdcall@8 # UNDECORATED-IMPLIB: Name type: undecorate +# UNDECORATED-IMPLIB-NEXT: Export name: vectorcall # UNDECORATED-IMPLIB-NEXT: __imp_vectorcall@@8 # UNDECORATED-IMPLIB-NEXT: vectorcall@@8 @@ -30,12 +34,15 @@ # RUN: llvm-readobj --coff-exports %t.dll | FileCheck -check-prefix DECORATED-EXPORTS %s # DECORATED-IMPLIB: Name type: name +# DECORATED-IMPLIB-NEXT: Export name: @fastcall@8 # DECORATED-IMPLIB-NEXT: __imp_@fastcall@8 # DECORATED-IMPLIB-NEXT: @fastcall@8 # DECORATED-IMPLIB: Name type: name +# DECORATED-IMPLIB-NEXT: Export name: _stdcall@8 # DECORATED-IMPLIB-NEXT: __imp__stdcall@8 # DECORATED-IMPLIB-NEXT: _stdcall@8 # DECORATED-IMPLIB: Name type: name +# DECORATED-IMPLIB-NEXT: Export name: vectorcall@@8 # DECORATED-IMPLIB-NEXT: __imp_vectorcall@@8 # DECORATED-IMPLIB-NEXT: vectorcall@@8 @@ -51,14 +58,17 @@ # RUN: llvm-readobj --coff-exports %t.dll | FileCheck -check-prefix DECORATED-MINGW-EXPORTS %s # DECORATED-MINGW-IMPLIB: Name type: name +# DECORATED-MINGW-IMPLIB-NEXT: Export name: @fastcall@8 # DECORATED-MINGW-IMPLIB-NEXT: __imp_@fastcall@8 # DECORATED-MINGW-IMPLIB-NEXT: fastcall@8 # DECORATED-MINGW-IMPLIB: Name type: noprefix +# DECORATED-MINGW-IMPLIB-NEXT: Export name: stdcall@8 # DECORATED-MINGW-IMPLIB-NEXT: __imp__stdcall@8 # DECORATED-MINGW-IMPLIB-NEXT: _stdcall@8 # GNU tools don't support vectorcall, but this test is just to track that # lld's behaviour remains consistent over time. 
# DECORATED-MINGW-IMPLIB: Name type: name +# DECORATED-MINGW-IMPLIB-NEXT: Export name: vectorcall@@8 # DECORATED-MINGW-IMPLIB-NEXT: __imp_vectorcall@@8 # DECORATED-MINGW-IMPLIB-NEXT: vectorcall@@8 @@ -75,14 +85,17 @@ # RUN: llvm-readobj --coff-exports %t.dll | FileCheck -check-prefix MINGW-KILL-AT-EXPORTS %s # MINGW-KILL-AT-IMPLIB: Name type: noprefix +# MINGW-KILL-AT-IMPLIB: Export name: fastcall # MINGW-KILL-AT-IMPLIB: __imp__fastcall # MINGW-KILL-AT-IMPLIB-NEXT: _fastcall # MINGW-KILL-AT-IMPLIB: Name type: noprefix +# MINGW-KILL-AT-IMPLIB-NEXT: Export name: stdcall # MINGW-KILL-AT-IMPLIB-NEXT: __imp__stdcall # MINGW-KILL-AT-IMPLIB-NEXT: _stdcall # GNU tools don't support vectorcall, but this test is just to track that # lld's behaviour remains consistent over time. # MINGW-KILL-AT-IMPLIB: Name type: noprefix +# MINGW-KILL-AT-IMPLIB-NEXT: Export name: vectorcall # MINGW-KILL-AT-IMPLIB-NEXT: __imp__vectorcall # MINGW-KILL-AT-IMPLIB-NEXT: _vectorcall diff --git a/lld/test/COFF/delayimports-armnt.yaml b/lld/test/COFF/delayimports-armnt.yaml index 7d9bc38c5c3606..ea96d864ef53d5 100644 --- a/lld/test/COFF/delayimports-armnt.yaml +++ b/lld/test/COFF/delayimports-armnt.yaml @@ -6,6 +6,7 @@ # RUN: llvm-readobj --coff-imports %t.exe | FileCheck -check-prefix=IMPORT %s # RUN: llvm-readobj --coff-basereloc %t.exe | FileCheck -check-prefix=BASEREL %s # RUN: llvm-objdump --no-print-imm-hex -d %t.exe | FileCheck --check-prefix=DISASM %s +# RUN: llvm-readobj --file-headers %t.exe | FileCheck -check-prefix=DIR %s # IMPORT: Format: COFF-ARM # IMPORT-NEXT: Arch: thumb @@ -13,9 +14,9 @@ # IMPORT-NEXT: DelayImport { # IMPORT-NEXT: Name: library.dll # IMPORT-NEXT: Attributes: 0x1 -# IMPORT-NEXT: ModuleHandle: 0x3000 -# IMPORT-NEXT: ImportAddressTable: 0x3008 -# IMPORT-NEXT: ImportNameTable: 0x2040 +# IMPORT-NEXT: ModuleHandle: 0x3008 +# IMPORT-NEXT: ImportAddressTable: 0x3010 +# IMPORT-NEXT: ImportNameTable: 0x2044 # IMPORT-NEXT: BoundDelayImportTable: 0x0 # IMPORT-NEXT: 
UnloadDelayImportTable: 0x0 # IMPORT-NEXT: Import { @@ -43,7 +44,7 @@ # BASEREL-NEXT: } # BASEREL-NEXT: Entry { # BASEREL-NEXT: Type: HIGHLOW -# BASEREL-NEXT: Address: 0x3008 +# BASEREL-NEXT: Address: 0x3010 # BASEREL-NEXT: } # BASEREL-NEXT: Entry { # BASEREL-NEXT: Type: ABSOLUTE @@ -52,20 +53,24 @@ # BASEREL-NEXT: ] # # DISASM: 00401000 <.text>: -# DISASM: 40100c: f243 0c08 movw r12, #12296 +# DISASM: 40100c: f243 0c10 movw r12, #12304 # DISASM-NEXT: f2c0 0c40 movt r12, #64 # DISASM-NEXT: f000 b800 b.w {{.+}} @ imm = #0 # DISASM-NEXT: e92d 480f push.w {r0, r1, r2, r3, r11, lr} # DISASM-NEXT: f20d 0b10 addw r11, sp, #16 # DISASM-NEXT: ed2d 0b10 vpush {d0, d1, d2, d3, d4, d5, d6, d7} # DISASM-NEXT: 4661 mov r1, r12 -# DISASM-NEXT: f242 0000 movw r0, #8192 +# DISASM-NEXT: f242 0004 movw r0, #8196 # DISASM-NEXT: f2c0 0040 movt r0, #64 # DISASM-NEXT: f7ff ffe7 bl 0x401000 <.text> # DISASM-NEXT: 4684 mov r12, r0 # DISASM-NEXT: ecbd 0b10 vpop {d0, d1, d2, d3, d4, d5, d6, d7} # DISASM-NEXT: e8bd 480f pop.w {r0, r1, r2, r3, r11, lr} # DISASM-NEXT: 4760 bx r12 +# +# DIR: DelayImportDescriptorRVA: 0x2004 +# DIR-NEXT: DelayImportDescriptorSize: 0x40 + --- !COFF header: @@ -80,6 +85,14 @@ sections: - VirtualAddress: 0 SymbolName: __imp_function Type: IMAGE_REL_ARM_MOV32T + - Name: .rdata + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ] + Alignment: 1 + SectionData: 01 + - Name: .data + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ] + Alignment: 1 + SectionData: 02 symbols: - Name: .text Value: 0 diff --git a/lld/test/COFF/dllexport.s b/lld/test/COFF/dllexport.s index a238b70ce1b4f6..b04ebc3a33c3e2 100644 --- a/lld/test/COFF/dllexport.s +++ b/lld/test/COFF/dllexport.s @@ -6,15 +6,19 @@ # RUN: llvm-readobj --coff-exports %t.dll | FileCheck -check-prefix DECORATED-EXPORTS %s # DECORATED-IMPLIB: Name type: name +# DECORATED-IMPLIB-NEXT: Export name: @fastcall@8 # DECORATED-IMPLIB-NEXT: __imp_@fastcall@8 # 
DECORATED-IMPLIB-NEXT: @fastcall@8 # DECORATED-IMPLIB: Name type: name +# DECORATED-IMPLIB-NEXT: Export name: _stdcall@8 # DECORATED-IMPLIB-NEXT: __imp__stdcall@8 # DECORATED-IMPLIB-NEXT: _stdcall@8 # DECORATED-IMPLIB: Name type: noprefix +# DECORATED-IMPLIB-NEXT: Export name: _underscored # DECORATED-IMPLIB-NEXT: __imp___underscored # DECORATED-IMPLIB-NEXT: __underscored # DECORATED-IMPLIB: Name type: name +# DECORATED-IMPLIB-NEXT: Export name: vectorcall@@8 # DECORATED-IMPLIB-NEXT: __imp_vectorcall@@8 # DECORATED-IMPLIB-NEXT: vectorcall@@8 diff --git a/lld/test/COFF/lto-cache-errors.ll b/lld/test/COFF/lto-cache-errors.ll index 55244e5690dc34..a46190a81b6230 100644 --- a/lld/test/COFF/lto-cache-errors.ll +++ b/lld/test/COFF/lto-cache-errors.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ;; Not supported on windows since we use permissions to deny the creation ; UNSUPPORTED: system-windows diff --git a/lld/test/COFF/thinlto-emit-imports.ll b/lld/test/COFF/thinlto-emit-imports.ll index a9f22c1dc2dcff..b47a6cea4eb7df 100644 --- a/lld/test/COFF/thinlto-emit-imports.ll +++ b/lld/test/COFF/thinlto-emit-imports.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ; Generate summary sections and test lld handling. 
; RUN: opt -module-summary %s -o %t1.obj diff --git a/lld/test/COFF/timestamp.test b/lld/test/COFF/timestamp.test index fbdc5788a33a55..cc73af13c38ca6 100644 --- a/lld/test/COFF/timestamp.test +++ b/lld/test/COFF/timestamp.test @@ -3,9 +3,28 @@ RUN: yaml2obj %p/Inputs/generic.yaml -o %t.obj RUN: lld-link %t.obj /debug /Brepro /entry:main /nodefaultlib /out:%t.1.exe RUN: lld-link %t.obj /debug /Brepro /entry:main /nodefaultlib /out:%t.2.exe RUN: lld-link %t.obj /debug /timestamp:0 /entry:main /nodefaultlib /out:%t.3.exe +RUN: env SOURCE_DATE_EPOCH=0 lld-link %t.obj /debug /entry:main /nodefaultlib /out:%t.4.exe +# Test timestamps corresponding to INT32_TMAX +RUN: lld-link %t.obj /debug /timestamp:2147483647 /entry:main /nodefaultlib /out:%t.5.exe +RUN: env SOURCE_DATE_EPOCH=2147483647 lld-link %t.obj /debug /entry:main /nodefaultlib /out:%t.6.exe +# Test that the command line option /timestamp has precedence over SOURCE_DATE_EPOCH +RUN: env SOURCE_DATE_EPOCH=12345 lld-link %t.obj /debug /timestamp:0 /entry:main /nodefaultlib /out:%t.7.exe +# Test timestamps corresponding to UINT32_TMAX +RUN: lld-link %t.obj /debug /timestamp:4294967295 /entry:main /nodefaultlib /out:%t.8.exe +RUN: env SOURCE_DATE_EPOCH=4294967295 lld-link %t.obj /debug /entry:main /nodefaultlib /out:%t.9.exe +# Test that setting UINT32_MAX+1 as timestamp fails. 
+RUN: env LLD_IN_TEST=1 not lld-link %t.obj /debug /timestamp:4294967296 /entry:main /nodefaultlib /out:%t.10.exe 2>&1 | FileCheck %s --check-prefix=ERROR +RUN: env SOURCE_DATE_EPOCH=4294967296 env LLD_IN_TEST=1 not lld-link %t.obj /debug /entry:main /nodefaultlib /out:%t.11.exe 2>&1 | FileCheck %s --check-prefix=ERROR2 RUN: llvm-readobj --file-headers --coff-debug-directory %t.1.exe | FileCheck %s --check-prefix=HASH RUN: llvm-readobj --file-headers --coff-debug-directory %t.2.exe | FileCheck %s --check-prefix=HASH RUN: llvm-readobj --file-headers --coff-debug-directory %t.3.exe | FileCheck %s --check-prefix=ZERO +RUN: llvm-readobj --file-headers --coff-debug-directory %t.4.exe | FileCheck %s --check-prefix=ZERO +RUN: llvm-readobj --file-headers --coff-debug-directory %t.5.exe | FileCheck %s --check-prefix=LARGE +RUN: llvm-readobj --file-headers --coff-debug-directory %t.6.exe | FileCheck %s --check-prefix=LARGE +RUN: llvm-readobj --file-headers --coff-debug-directory %t.7.exe | FileCheck %s --check-prefix=ZERO + +# Not inspecting %t.8.exe and %t.9.exe; llvm-readobj with a 32 bit time_t fails to print dates +# past INT32_MAX correctly. HASH: ImageFileHeader { HASH: TimeDateStamp: [[STAMP:.*]] @@ -16,3 +35,11 @@ ZERO: ImageFileHeader { ZERO: TimeDateStamp: 1970-01-01 00:00:00 (0x0) ZERO: DebugDirectory [ ZERO: TimeDateStamp: 1970-01-01 00:00:00 (0x0) + +LARGE: ImageFileHeader { +LARGE: TimeDateStamp: 2038-01-19 03:14:07 (0x7FFFFFFF) +LARGE: DebugDirectory [ +LARGE: TimeDateStamp: 2038-01-19 03:14:07 (0x7FFFFFFF) + +ERROR: error: invalid timestamp: 4294967296. Expected 32-bit integer +ERROR2: error: invalid SOURCE_DATE_EPOCH timestamp: 4294967296. Expected 32-bit integer diff --git a/lld/test/ELF/Inputs/systemz-init.s b/lld/test/ELF/Inputs/systemz-init.s new file mode 100644 index 00000000000000..1611b69b4419e3 --- /dev/null +++ b/lld/test/ELF/Inputs/systemz-init.s @@ -0,0 +1,5 @@ +// glibc < 2.39 used to align .init and .fini code at a 4-byte boundary. 
+// This file aims to recreate that behavior. + .section .init,"ax",@progbits + .align 4 + lg %r4, 272(%r15) diff --git a/lld/test/ELF/basic-systemz.s b/lld/test/ELF/basic-systemz.s new file mode 100644 index 00000000000000..f7bb0e8cbd020d --- /dev/null +++ b/lld/test/ELF/basic-systemz.s @@ -0,0 +1,63 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o +# RUN: ld.lld --hash-style=sysv -discard-all -shared %t.o -o %t.so +# RUN: llvm-readelf --file-header --program-headers --section-headers --dynamic-table %t.so | FileCheck %s + +# Exits with return code 55 on linux. +.text + lghi 2,55 + svc 1 + +# CHECK: ELF Header: +# CHECK-NEXT: Magic: 7f 45 4c 46 02 02 01 00 00 00 00 00 00 00 00 00 +# CHECK-NEXT: Class: ELF64 +# CHECK-NEXT: Data: 2's complement, big endian +# CHECK-NEXT: Version: 1 (current) +# CHECK-NEXT: OS/ABI: UNIX - System V +# CHECK-NEXT: ABI Version: 0 +# CHECK-NEXT: Type: DYN (Shared object file) +# CHECK-NEXT: Machine: IBM S/390 +# CHECK-NEXT: Version: 0x1 +# CHECK-NEXT: Entry point address: 0x0 +# CHECK-NEXT: Start of program headers: 64 (bytes into file) +# CHECK-NEXT: Start of section headers: 768 (bytes into file) +# CHECK-NEXT: Flags: 0x0 +# CHECK-NEXT: Size of this header: 64 (bytes) +# CHECK-NEXT: Size of program headers: 56 (bytes) +# CHECK-NEXT: Number of program headers: 7 +# CHECK-NEXT: Size of section headers: 64 (bytes) +# CHECK-NEXT: Number of section headers: 11 +# CHECK-NEXT: Section header string table index: 9 + +# CHECK: Section Headers: +# CHECK-NEXT: [Nr] Name Type Address Off Size ES Flg Lk Inf Al +# CHECK-NEXT: [ 0] NULL 0000000000000000 000000 000000 00 0 0 0 +# CHECK-NEXT: [ 1] .dynsym DYNSYM 00000000000001c8 0001c8 000018 18 A 3 1 8 +# CHECK-NEXT: [ 2] .hash HASH 00000000000001e0 0001e0 000010 04 A 1 0 4 +# CHECK-NEXT: [ 3] .dynstr STRTAB 00000000000001f0 0001f0 000001 00 A 0 0 1 +# CHECK-NEXT: [ 4] .text PROGBITS 00000000000011f4 0001f4 000006 00 AX 0 0 4 +# CHECK-NEXT: [ 5] .dynamic 
DYNAMIC 0000000000002200 000200 000060 10 WA 3 0 8 +# CHECK-NEXT: [ 6] .relro_padding NOBITS 0000000000002260 000260 000da0 00 WA 0 0 1 +# CHECK-NEXT: [ 7] .comment PROGBITS 0000000000000000 000260 000008 01 MS 0 0 1 +# CHECK-NEXT: [ 8] .symtab SYMTAB 0000000000000000 000268 000030 18 10 2 8 +# CHECK-NEXT: [ 9] .shstrtab STRTAB 0000000000000000 000298 000058 00 0 0 1 +# CHECK-NEXT: [10] .strtab STRTAB 0000000000000000 0002f0 00000a 00 0 0 1 + +# CHECK: Program Headers: +# CHECK-NEXT: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align +# CHECK-NEXT: PHDR 0x000040 0x0000000000000040 0x0000000000000040 0x000188 0x000188 R 0x8 +# CHECK-NEXT: LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x0001f1 0x0001f1 R 0x1000 +# CHECK-NEXT: LOAD 0x0001f4 0x00000000000011f4 0x00000000000011f4 0x000006 0x000006 R E 0x1000 +# CHECK-NEXT: LOAD 0x000200 0x0000000000002200 0x0000000000002200 0x000060 0x000e00 RW 0x1000 +# CHECK-NEXT: DYNAMIC 0x000200 0x0000000000002200 0x0000000000002200 0x000060 0x000060 RW 0x8 +# CHECK-NEXT: GNU_RELRO 0x000200 0x0000000000002200 0x0000000000002200 0x000060 0x000e00 R 0x1 +# CHECK-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0x0 + +# CHECK: Dynamic section at offset 0x200 contains 6 entries: +# CHECK-NEXT: Tag Type Name/Value +# CHECK-NEXT: 0x0000000000000006 (SYMTAB) 0x1c8 +# CHECK-NEXT: 0x000000000000000b (SYMENT) 24 (bytes) +# CHECK-NEXT: 0x0000000000000005 (STRTAB) 0x1f0 +# CHECK-NEXT: 0x000000000000000a (STRSZ) 1 (bytes) +# CHECK-NEXT: 0x0000000000000004 (HASH) 0x1e0 +# CHECK-NEXT: 0x0000000000000000 (NULL) 0x0 diff --git a/lld/test/ELF/dead-reloc-in-nonalloc.s b/lld/test/ELF/dead-reloc-in-nonalloc.s index 145604eb883a9a..b675fc50fc2ea2 100644 --- a/lld/test/ELF/dead-reloc-in-nonalloc.s +++ b/lld/test/ELF/dead-reloc-in-nonalloc.s @@ -17,7 +17,7 @@ # AA: Contents of section .debug_info: # AA-NEXT: 0000 [[ADDR]] 00000000 aaaaaaaa 00000000 # AA: Contents of section .not_debug: -# AA-NEXT: 0000 bbbbbbbb 
bbbbbbbb 00000000 . +# AA-NEXT: 0000 bbbbbbbb 2a000000 00000000 . ## Specifying zero can get a behavior similar to GNU ld. # RUN: ld.lld --icf=all -z dead-reloc-in-nonalloc=.debug_info=0 %t.o %tabs.o -o %tzero diff --git a/lld/test/ELF/emulation-systemz.s b/lld/test/ELF/emulation-systemz.s new file mode 100644 index 00000000000000..dfdb4620954c8a --- /dev/null +++ b/lld/test/ELF/emulation-systemz.s @@ -0,0 +1,29 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o +# RUN: ld.lld -m elf64_s390 %t.o -o %t1 +# RUN: llvm-readelf --file-header %t1 | FileCheck %s +# RUN: ld.lld %t.o -o %t2 +# RUN: llvm-readelf --file-header %t2 | FileCheck %s +# RUN: echo 'OUTPUT_FORMAT(elf64-s390)' > %t.script +# RUN: ld.lld %t.script %t.o -o %t3 +# RUN: llvm-readelf --file-header %t3 | FileCheck %s + +# CHECK: ELF Header: +# CHECK-NEXT: Magic: 7f 45 4c 46 02 02 01 00 00 00 00 00 00 00 00 00 +# CHECK-NEXT: Class: ELF64 +# CHECK-NEXT: Data: 2's complement, big endian +# CHECK-NEXT: Version: 1 (current) +# CHECK-NEXT: OS/ABI: UNIX - System V +# CHECK-NEXT: ABI Version: 0 +# CHECK-NEXT: Type: EXEC (Executable file) +# CHECK-NEXT: Machine: IBM S/390 +# CHECK-NEXT: Version: 0x1 +# CHECK-NEXT: Entry point address: +# CHECK-NEXT: Start of program headers: 64 (bytes into file) +# CHECK-NEXT: Start of section headers: +# CHECK-NEXT: Flags: 0x0 +# CHECK-NEXT: Size of this header: 64 (bytes) +# CHECK-NEXT: Size of program headers: 56 (bytes) + +.globl _start +_start: diff --git a/lld/test/ELF/linkerscript/discard-section.s b/lld/test/ELF/linkerscript/discard-section.s index 24f3b2b73e991f..0bbebac59bb345 100644 --- a/lld/test/ELF/linkerscript/discard-section.s +++ b/lld/test/ELF/linkerscript/discard-section.s @@ -9,6 +9,9 @@ # RUN: ld.lld -r -T a.lds a.o b.o -o a.ro 2>&1 | FileCheck %s --check-prefix=WARNING --implicit-check-not=warning: # RUN: llvm-readelf -r -s a.ro | FileCheck %s --check-prefix=RELOC +# RUN: ld.lld -r --gc-sections -T a.lds a.o b.o -o 
a.gc.ro --no-fatal-warnings +# RUN: llvm-readelf -r -s a.gc.ro | FileCheck %s --check-prefix=RELOC-GC + # LOCAL: error: relocation refers to a discarded section: .aaa # LOCAL-NEXT: >>> defined in a.o # LOCAL-NEXT: >>> referenced by a.o:(.bbb+0x0) @@ -32,16 +35,18 @@ # WARNING: warning: relocation refers to a discarded section: .aaa # WARNING-NEXT: >>> referenced by a.o:(.rela.bbb+0x0) +## GNU ld reports "defined in discarded secion" errors even in -r mode. +## We set the symbol index to 0. # RELOC: Relocation section '.rela.bbb' at offset {{.*}} contains 1 entries: # RELOC-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend # RELOC-NEXT: 0000000000000000 0000000000000000 R_X86_64_NONE 0 # RELOC-EMPTY: # RELOC-NEXT: Relocation section '.rela.data' at offset {{.*}} contains 4 entries: # RELOC-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend -# RELOC-NEXT: 0000000000000000 0000000500000001 R_X86_64_64 0000000000000000 global + 0 -# RELOC-NEXT: 0000000000000008 0000000700000001 R_X86_64_64 0000000000000000 weak + 0 -# RELOC-NEXT: 0000000000000010 0000000600000001 R_X86_64_64 0000000000000000 weakref1 + 0 -# RELOC-NEXT: 0000000000000018 0000000800000001 R_X86_64_64 0000000000000000 weakref2 + 0 +# RELOC-NEXT: 0000000000000000 0000000000000001 R_X86_64_64 0 +# RELOC-NEXT: 0000000000000008 0000000000000001 R_X86_64_64 0 +# RELOC-NEXT: 0000000000000010 0000000000000001 R_X86_64_64 0 +# RELOC-NEXT: 0000000000000018 0000000000000001 R_X86_64_64 0 # RELOC: Num: Value Size Type Bind Vis Ndx Name # RELOC-NEXT: 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND @@ -49,23 +54,25 @@ # RELOC-NEXT: 2: 0000000000000000 0 SECTION LOCAL DEFAULT 2 .bbb # RELOC-NEXT: 3: 0000000000000000 0 SECTION LOCAL DEFAULT 4 .data # RELOC-NEXT: 4: 0000000000000000 0 NOTYPE GLOBAL DEFAULT 1 _start -# RELOC-NEXT: 5: 0000000000000000 0 NOTYPE GLOBAL DEFAULT UND global -# RELOC-NEXT: 6: 0000000000000000 0 NOTYPE GLOBAL DEFAULT UND weakref1 -# RELOC-NEXT: 7: 0000000000000000 0 NOTYPE 
GLOBAL DEFAULT UND weak -# RELOC-NEXT: 8: 0000000000000000 0 NOTYPE GLOBAL DEFAULT UND weakref2 # RELOC-EMPTY: +# RELOC-GC: There are no relocations in this file. + #--- a.s .globl _start _start: .section .aaa,"a" -.globl global, weakref1 +.globl global, weakref1, unused .weak weak, weakref2 global: weak: weakref1: weakref2: +## Eliminate `unused` just like GC discarded definitions. +## Linux kernel's CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y configuration expects +## that the unreferenced `unused` is not emitted to .symtab. +unused: .quad 0 .section .bbb,"aw" diff --git a/lld/test/ELF/linkerscript/insert-before.test b/lld/test/ELF/linkerscript/insert-before.test index e6ed413639827a..a72834988007ce 100644 --- a/lld/test/ELF/linkerscript/insert-before.test +++ b/lld/test/ELF/linkerscript/insert-before.test @@ -24,8 +24,9 @@ ## without making more layout changes. Address/offset assignments are different ## with a main linker script. -# RUN: ld.lld --script %s %t1.o -o %t2 -# RUN: llvm-readelf -S -l %t2 | FileCheck --check-prefix=CHECK2 %s +## Test non-contiguous OutputDescs in script->sectionCommands. +# RUN: ld.lld --defsym y0=1 %s --defsym y1=1 %t1.o -o %t2 +# RUN: llvm-readelf -S -l -sX %t2 | FileCheck --check-prefix=CHECK2 %s # CHECK2: Name Type Address Off Size ES Flg # CHECK2-NEXT: NULL # CHECK2-NEXT: .foo.text PROGBITS 000000000020{{.*}} [[#%x,]] 000008 00 AX @@ -40,9 +41,13 @@ # CHECK2-NEXT: LOAD {{.*}} RW 0x1000 # CHECK2-NEXT: GNU_STACK {{.*}} RW 0 +# CHECK2: NOTYPE GLOBAL DEFAULT ABS y0 +# CHECK2: NOTYPE GLOBAL DEFAULT [[#]] (.foo.text) x0 +# CHECK2: NOTYPE GLOBAL DEFAULT ABS y1 + SECTIONS { .byte : { BYTE(0) } } INSERT BEFORE .data; SECTIONS { .foo.data : { *(.foo.data) } } INSERT BEFORE .data; ## The input section .foo.text is an orphan. 
It will be placed in .foo.text -SECTIONS { .foo.text : {} } INSERT BEFORE .text; +SECTIONS { .foo.text : { x0 = .; } } INSERT BEFORE .text; diff --git a/lld/test/ELF/loongarch-relax-align.s b/lld/test/ELF/loongarch-relax-align.s new file mode 100644 index 00000000000000..ab61e15d5caca2 --- /dev/null +++ b/lld/test/ELF/loongarch-relax-align.s @@ -0,0 +1,126 @@ +# REQUIRES: loongarch + +# RUN: llvm-mc --filetype=obj --triple=loongarch32 --mattr=+relax %s -o %t.32.o +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.64.o +# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 %t.32.o -o %t.32 +# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 %t.64.o -o %t.64 +# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 %t.32.o --no-relax -o %t.32n +# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 %t.64.o --no-relax -o %t.64n +# RUN: llvm-objdump -td --no-show-raw-insn %t.32 | FileCheck %s +# RUN: llvm-objdump -td --no-show-raw-insn %t.64 | FileCheck %s +# RUN: llvm-objdump -td --no-show-raw-insn %t.32n | FileCheck %s +# RUN: llvm-objdump -td --no-show-raw-insn %t.64n | FileCheck %s + +## Test the R_LARCH_ALIGN without symbol index. +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.o64.o --defsym=old=1 +# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 %t.o64.o -o %t.o64 +# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 %t.o64.o --no-relax -o %t.o64n +# RUN: llvm-objdump -td --no-show-raw-insn %t.o64 | FileCheck %s +# RUN: llvm-objdump -td --no-show-raw-insn %t.o64n | FileCheck %s + +## -r keeps section contents unchanged. 
+# RUN: ld.lld -r %t.64.o -o %t.64.r +# RUN: llvm-objdump -dr --no-show-raw-insn %t.64.r | FileCheck %s --check-prefix=CHECKR + +# CHECK-DAG: {{0*}}10000 l .text {{0*}}44 .Ltext_start +# CHECK-DAG: {{0*}}10038 l .text {{0*}}0c .L1 +# CHECK-DAG: {{0*}}10040 l .text {{0*}}04 .L2 +# CHECK-DAG: {{0*}}20000 l .text2 {{0*}}14 .Ltext2_start + +# CHECK: <.Ltext_start>: +# CHECK-NEXT: break 1 +# CHECK-NEXT: break 2 +# CHECK-NEXT: nop +# CHECK-NEXT: nop +# CHECK-NEXT: break 3 +# CHECK-NEXT: break 4 +# CHECK-NEXT: nop +# CHECK-NEXT: nop +# CHECK-NEXT: pcalau12i $a0, 0 +# CHECK-NEXT: addi.{{[dw]}} $a0, $a0, 0 +# CHECK-NEXT: pcalau12i $a0, 0 +# CHECK-NEXT: addi.{{[dw]}} $a0, $a0, 56 +# CHECK-NEXT: pcalau12i $a0, 0 +# CHECK-NEXT: addi.{{[dw]}} $a0, $a0, 64 +# CHECK-EMPTY: +# CHECK-NEXT: <.L1>: +# CHECK-NEXT: nop +# CHECK-NEXT: nop +# CHECK-EMPTY: +# CHECK-NEXT: <.L2>: +# CHECK-NEXT: break 5 + +# CHECK: <.Ltext2_start>: +# CHECK-NEXT: pcalau12i $a0, 0 +# CHECK-NEXT: addi.{{[dw]}} $a0, $a0, 0 +# CHECK-NEXT: nop +# CHECK-NEXT: nop +# CHECK-NEXT: break 6 + +# CHECKR: <.Ltext2_start>: +# CHECKR-NEXT: pcalau12i $a0, 0 +# CHECKR-NEXT: {{0*}}00: R_LARCH_PCALA_HI20 .Ltext2_start +# CHECKR-NEXT: {{0*}}00: R_LARCH_RELAX *ABS* +# CHECKR-NEXT: addi.d $a0, $a0, 0 +# CHECKR-NEXT: {{0*}}04: R_LARCH_PCALA_LO12 .Ltext2_start +# CHECKR-NEXT: {{0*}}04: R_LARCH_RELAX *ABS* +# CHECKR-NEXT: nop +# CHECKR-NEXT: {{0*}}08: R_LARCH_ALIGN .Lalign_symbol+0x4 +# CHECKR-NEXT: nop +# CHECKR-NEXT: nop +# CHECKR-NEXT: break 6 + +.macro .fake_p2align_4 max=0 + .ifdef old + .if \max==0 + .reloc ., R_LARCH_ALIGN, 0xc + nop; nop; nop + .endif + .else + .reloc ., R_LARCH_ALIGN, .Lalign_symbol + 0x4 + (\max << 8) + nop; nop; nop + .endif +.endm + + .text +.Lalign_symbol: +.Ltext_start: + break 1 + break 2 +## +0x8: Emit 2 nops, delete 1 nop. + .fake_p2align_4 + + break 3 +## +0x14: Emit 3 nops > 8 bytes, not emit. + .fake_p2align_4 8 + + break 4 + .fake_p2align_4 8 +## +0x18: Emit 2 nops <= 8 bytes. 
+ +## Compensate +.ifdef old + nop; nop +.endif + +## +0x20: Test symbol value and symbol size can be handled. + la.pcrel $a0, .Ltext_start + la.pcrel $a0, .L1 + la.pcrel $a0, .L2 + +## +0x38: Emit 2 nops, delete 1 nop. +.L1: + .fake_p2align_4 +.L2: + break 5 + .size .L1, . - .L1 + .size .L2, . - .L2 + .size .Ltext_start, . - .Ltext_start + +## Test another text section. + .section .text2,"ax",@progbits +.Ltext2_start: + la.pcrel $a0, .Ltext2_start + .fake_p2align_4 + break 6 + .size .Ltext2_start, . - .Ltext2_start diff --git a/lld/test/ELF/loongarch-relax-emit-relocs.s b/lld/test/ELF/loongarch-relax-emit-relocs.s new file mode 100644 index 00000000000000..581fce8c95caa4 --- /dev/null +++ b/lld/test/ELF/loongarch-relax-emit-relocs.s @@ -0,0 +1,49 @@ +# REQUIRES: loongarch +## Test that we can handle --emit-relocs while relaxing. + +# RUN: llvm-mc --filetype=obj --triple=loongarch32 --mattr=+relax %s -o %t.32.o +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.64.o +# RUN: ld.lld -Ttext=0x10000 --emit-relocs %t.32.o -o %t.32 +# RUN: ld.lld -Ttext=0x10000 --emit-relocs %t.64.o -o %t.64 +# RUN: llvm-objdump -dr %t.32 | FileCheck %s +# RUN: llvm-objdump -dr %t.64 | FileCheck %s + +## -r should keep original relocations. +# RUN: ld.lld -r %t.64.o -o %t.64.r +# RUN: llvm-objdump -dr %t.64.r | FileCheck %s --check-prefix=CHECKR + +## --no-relax should keep original relocations. +## TODO Due to R_LARCH_RELAX is not relaxed, it plays same as --relax now. 
+# RUN: ld.lld -Ttext=0x10000 --emit-relocs --no-relax %t.64.o -o %t.64.norelax +# RUN: llvm-objdump -dr %t.64.norelax | FileCheck %s + +# CHECK: 00010000 <_start>: +# CHECK-NEXT: pcalau12i $a0, 0 +# CHECK-NEXT: R_LARCH_PCALA_HI20 _start +# CHECK-NEXT: R_LARCH_RELAX *ABS* +# CHECK-NEXT: addi.{{[dw]}} $a0, $a0, 0 +# CHECK-NEXT: R_LARCH_PCALA_LO12 _start +# CHECK-NEXT: R_LARCH_RELAX *ABS* +# CHECK-NEXT: nop +# CHECK-NEXT: R_LARCH_ALIGN .Lla-relax-align0+0x4 +# CHECK-NEXT: nop +# CHECK-NEXT: ret + +# CHECKR: <_start>: +# CHECKR-NEXT: pcalau12i $a0, 0 +# CHECKR-NEXT: R_LARCH_PCALA_HI20 _start +# CHECKR-NEXT: R_LARCH_RELAX *ABS* +# CHECKR-NEXT: addi.d $a0, $a0, 0 +# CHECKR-NEXT: R_LARCH_PCALA_LO12 _start +# CHECKR-NEXT: R_LARCH_RELAX *ABS* +# CHECKR-NEXT: nop +# CHECKR-NEXT: R_LARCH_ALIGN .Lla-relax-align0+0x4 +# CHECKR-NEXT: nop +# CHECKR-NEXT: nop +# CHECKR-NEXT: ret + +.global _start +_start: + la.pcrel $a0, _start + .p2align 4 + ret diff --git a/lld/test/ELF/loongarch-reloc-leb128.s b/lld/test/ELF/loongarch-reloc-leb128.s new file mode 100644 index 00000000000000..2dd327d1564ebd --- /dev/null +++ b/lld/test/ELF/loongarch-reloc-leb128.s @@ -0,0 +1,102 @@ +# REQUIRES: loongarch +# RUN: rm -rf %t && split-file %s %t && cd %t + +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax a.s -o a.o +# RUN: llvm-readobj -r -x .gcc_except_table -x .debug_rnglists -x .debug_loclists a.o | FileCheck %s --check-prefix=REL +# RUN: ld.lld -shared --gc-sections a.o -o a.so +# RUN: llvm-readelf -x .gcc_except_table -x .debug_rnglists -x .debug_loclists a.so | FileCheck %s + +# RUN: llvm-mc --filetype=obj --triple=loongarch32 --mattr=+relax a.s -o a32.o +# RUN: llvm-readobj -r -x .gcc_except_table -x .debug_rnglists -x .debug_loclists a32.o | FileCheck %s --check-prefix=REL +# RUN: ld.lld -shared --gc-sections a32.o -o a32.so +# RUN: llvm-readelf -x .gcc_except_table -x .debug_rnglists -x .debug_loclists a32.so | FileCheck %s + +# RUN: llvm-mc --filetype=obj 
--triple=loongarch32 --mattr=+relax extraspace.s -o extraspace32.o +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax extraspace.s -o extraspace64.o +# RUN: not ld.lld -shared extraspace32.o 2>&1 | FileCheck %s --check-prefix=ERROR +# RUN: not ld.lld -shared extraspace64.o 2>&1 | FileCheck %s --check-prefix=ERROR +# ERROR: error: extraspace{{.*}}.o:(.rodata+0x0): extra space for uleb128 + +#--- a.s +.cfi_startproc +.cfi_lsda 0x1b,.LLSDA0 +.cfi_endproc + +.section .text.w,"axR" +break 0; break 0; break 0; w1: + .p2align 4 # 4 bytes after relaxation +w2: break 0 + +.section .text.x,"ax" +break 0; break 0; break 0; x1: + .p2align 4 # 4 bytes after relaxation +x2: break 0 + +.section .gcc_except_table,"a" +.LLSDA0: +.uleb128 w2-w1+116 # initial value: 0x0080 +.uleb128 w1-w2+141 # initial value: 0x0080 +.uleb128 w2-w1+16372 # initial value: 0x008080 +.uleb128 w1-w2+16397 # initial value: 0x008080 +.uleb128 w2-w1+2097140 # initial value: 0x00808080 +.uleb128 w1-w2+2097165 # initial value: 0x00808080 + +.section .debug_rnglists +.uleb128 w2-w1+116 # initial value: 0x0080 +.uleb128 w1-w2+141 # initial value: 0x0080 +.uleb128 w2-w1+16372 # initial value: 0x008080 +.uleb128 w1-w2+16397 # initial value: 0x008080 +.uleb128 w2-w1+2097140 # initial value: 0x00808080 +.uleb128 w1-w2+2097165 # initial value: 0x00808080 + +.section .debug_loclists +.uleb128 x2-x1 # references discarded symbols + +# REL: Section ({{.*}}) .rela.debug_rnglists { +# REL-NEXT: 0x0 R_LARCH_ADD_ULEB128 w2 0x74 +# REL-NEXT: 0x0 R_LARCH_SUB_ULEB128 w1 0x0 +# REL-NEXT: 0x2 R_LARCH_ADD_ULEB128 w1 0x8D +# REL-NEXT: 0x2 R_LARCH_SUB_ULEB128 w2 0x0 +# REL-NEXT: 0x4 R_LARCH_ADD_ULEB128 w2 0x3FF4 +# REL-NEXT: 0x4 R_LARCH_SUB_ULEB128 w1 0x0 +# REL-NEXT: 0x7 R_LARCH_ADD_ULEB128 w1 0x400D +# REL-NEXT: 0x7 R_LARCH_SUB_ULEB128 w2 0x0 +# REL-NEXT: 0xA R_LARCH_ADD_ULEB128 w2 0x1FFFF4 +# REL-NEXT: 0xA R_LARCH_SUB_ULEB128 w1 0x0 +# REL-NEXT: 0xE R_LARCH_ADD_ULEB128 w1 0x20000D +# REL-NEXT: 0xE 
R_LARCH_SUB_ULEB128 w2 0x0 +# REL-NEXT: } +# REL: Section ({{.*}}) .rela.debug_loclists { +# REL-NEXT: 0x0 R_LARCH_ADD_ULEB128 x2 0x0 +# REL-NEXT: 0x0 R_LARCH_SUB_ULEB128 x1 0x0 +# REL-NEXT: } + +# REL: Hex dump of section '.gcc_except_table': +# REL-NEXT: 0x00000000 80008000 80800080 80008080 80008080 . +# REL-NEXT: 0x00000010 8000 . +# REL: Hex dump of section '.debug_rnglists': +# REL-NEXT: 0x00000000 80008000 80800080 80008080 80008080 . +# REL-NEXT: 0x00000010 8000 . +# REL: Hex dump of section '.debug_loclists': +# REL-NEXT: 0x00000000 00 . + +# CHECK: Hex dump of section '.gcc_except_table': +# CHECK-NEXT: 0x[[#%x,]] f8008901 f8ff0089 8001f8ff ff008980 . +# CHECK-NEXT: 0x[[#%x,]] 8001 . +# CHECK: Hex dump of section '.debug_rnglists': +# CHECK-NEXT: 0x00000000 f8008901 f8ff0089 8001f8ff ff008980 . +# CHECK-NEXT: 0x00000010 8001 . +# CHECK: Hex dump of section '.debug_loclists': +# CHECK-NEXT: 0x00000000 00 . + +#--- extraspace.s +.text +w1: + la.pcrel $t0, w1 +w2: + +.rodata +.reloc ., R_LARCH_ADD_ULEB128, w2 +.reloc ., R_LARCH_SUB_ULEB128, w1 +.fill 10, 1, 0x80 +.byte 1 diff --git a/lld/test/ELF/lto/resolution-err.ll b/lld/test/ELF/lto/resolution-err.ll index 6dfa64b1b8b9ee..f9855abaff3279 100644 --- a/lld/test/ELF/lto/resolution-err.ll +++ b/lld/test/ELF/lto/resolution-err.ll @@ -1,5 +1,5 @@ ; UNSUPPORTED: system-windows -; REQUIRES: shell +; REQUIRES: shell, non-root-user ; RUN: llvm-as %s -o %t.bc ; RUN: touch %t.resolution.txt ; RUN: chmod u-w %t.resolution.txt diff --git a/lld/test/ELF/lto/systemz.ll b/lld/test/ELF/lto/systemz.ll new file mode 100644 index 00000000000000..42bf4e32fb6d75 --- /dev/null +++ b/lld/test/ELF/lto/systemz.ll @@ -0,0 +1,18 @@ +; REQUIRES: systemz +;; Test we can infer the e_machine value EM_S390 from a bitcode file. 
+ +; RUN: llvm-as %s -o %t.o +; RUN: ld.lld %t.o -o %t +; RUN: llvm-readobj -h %t | FileCheck %s + +; CHECK: Class: 64-bit +; CHECK: DataEncoding: BigEndian +; CHECK: Machine: EM_S390 + +target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" +target triple = "s390x-unknown-linux-gnu" + +define void @_start() { +entry: + ret void +} diff --git a/lld/test/ELF/lto/thinlto-cant-write-index.ll b/lld/test/ELF/lto/thinlto-cant-write-index.ll index e664acbb17de1a..286fcddd4238a1 100644 --- a/lld/test/ELF/lto/thinlto-cant-write-index.ll +++ b/lld/test/ELF/lto/thinlto-cant-write-index.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ; Basic ThinLTO tests. ; RUN: opt -module-summary %s -o %t1.o diff --git a/lld/test/ELF/lto/thinlto-emit-imports.ll b/lld/test/ELF/lto/thinlto-emit-imports.ll index 6d0e1e65047db4..253ec08619c982 100644 --- a/lld/test/ELF/lto/thinlto-emit-imports.ll +++ b/lld/test/ELF/lto/thinlto-emit-imports.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ;; Test a few properties not tested by thinlto-index-only.ll ; RUN: opt -module-summary %s -o %t1.o diff --git a/lld/test/ELF/mips-pc-relocs.s b/lld/test/ELF/mips-pc-relocs.s index 5e7dbed94ca7c4..7d23f9d7469a48 100644 --- a/lld/test/ELF/mips-pc-relocs.s +++ b/lld/test/ELF/mips-pc-relocs.s @@ -40,11 +40,13 @@ __start: # ^-- (0x20020-0x20000)>>2 # CHECK-NEXT: 20004: beqc $5, $6, 0x20020 # ^-- (0x20020-4-0x20004)>>2 -# CHECK-NEXT: 20008: beqzc $9, 0x20020 -# ^-- (0x20020-4-0x20008)>>2 -# CHECK-NEXT: 2000c: bc 0x20020 -# ^-- (0x20020-4-0x2000c)>>2 -# CHECK-NEXT: 20010: aluipc $2, 0 -# ^-- %hi(0x20020-0x20010) -# CHECK-NEXT: 20014: addiu $2, $2, 12 -# ^-- %lo(0x20020-0x20014) +# CHECK-NEXT: 20008: nop +# CHECK-NEXT: 2000c: beqzc $9, 0x20020 +# ^-- (0x20020-4-0x2000c)>>2 +# CHECK-NEXT: 20010: nop +# CHECK-NEXT: 20014: bc 0x20020 +# ^-- (0x20020-4-0x200014)>>2 +# CHECK-NEXT: 20018: aluipc $2, 0 +# ^-- %hi(0x20020-0x20018) +# CHECK-NEXT: 2001c: addiu $2, 
$2, 4 +# ^-- %lo(0x20020-0x2001c) diff --git a/lld/test/ELF/riscv-tlsdesc-gd-mixed.s b/lld/test/ELF/riscv-tlsdesc-gd-mixed.s new file mode 100644 index 00000000000000..c0e91593ed9686 --- /dev/null +++ b/lld/test/ELF/riscv-tlsdesc-gd-mixed.s @@ -0,0 +1,26 @@ +# REQUIRES: riscv +# RUN: llvm-mc -filetype=obj -triple=riscv64 %s -o %t.o +# RUN: ld.lld -shared %t.o -o %t.so +# RUN: llvm-readobj -r %t.so | FileCheck %s --check-prefix=RELA + +## Both TLSDESC and DTPMOD64/DTPREL64 should be present. +# RELA: .rela.dyn { +# RELA-NEXT: 0x[[#%X,ADDR:]] R_RISCV_TLSDESC a 0x0 +# RELA-NEXT: 0x[[#ADDR+16]] R_RISCV_TLS_DTPMOD64 a 0x0 +# RELA-NEXT: 0x[[#ADDR+24]] R_RISCV_TLS_DTPREL64 a 0x0 +# RELA-NEXT: } + + la.tls.gd a0,a + call __tls_get_addr@plt + +.Ltlsdesc_hi0: + auipc a2, %tlsdesc_hi(a) + ld a3, %tlsdesc_load_lo(.Ltlsdesc_hi0)(a2) + addi a0, a2, %tlsdesc_add_lo(.Ltlsdesc_hi0) + jalr t0, 0(a3), %tlsdesc_call(.Ltlsdesc_hi0) + +.section .tbss,"awT",@nobits +.globl a +.zero 8 +a: +.zero 4 diff --git a/lld/test/ELF/riscv-tlsdesc-relax.s b/lld/test/ELF/riscv-tlsdesc-relax.s new file mode 100644 index 00000000000000..fb24317e6535ca --- /dev/null +++ b/lld/test/ELF/riscv-tlsdesc-relax.s @@ -0,0 +1,189 @@ +# REQUIRES: riscv +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: llvm-mc -filetype=obj -triple=riscv64 --defsym PAD=0 -mattr=+c,+relax a.s -o a.64.o +# RUN: llvm-mc -filetype=obj -triple=riscv64 --defsym PAD=5000 -mattr=+c,+relax a.s -o aa.64.o +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+c,+relax c.s -o c.64.o +# RUN: ld.lld -shared -soname=c.64.so c.64.o -o c.64.so + +# RUN: ld.lld -shared -z now a.64.o c.64.o -o a.64.so -z separate-code +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -h -d a.64.so | FileCheck %s --check-prefix=GD64 + +## Test the TLSDESC to LE optimization. Also check --emit-relocs. 
+# RUN: ld.lld -e 0 -z now a.64.o c.64.o -o a.64.le -z separate-code --emit-relocs +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -hdr a.64.le | FileCheck %s --check-prefix=LE64 +# RUN: ld.lld -e 0 -z now aa.64.o c.64.o -o aa.64.le -z separate-code +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -h -d aa.64.le | FileCheck %s --check-prefix=LE64A + +## Test the TLSDESC to IE optimization. +# RUN: ld.lld -e 0 -z now a.64.o c.64.so -o a.64.ie -z separate-code +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -h -d a.64.ie | FileCheck %s --check-prefix=IE64 + +# GD64: .got 00000018 00000000000020c0 +# GD64-LABEL: <_start>: +# GD64-NEXT: jal {{.*}} +# GD64-LABEL: : +## &.got[c]-. = 0x20c0+8 - 0x1004 = 0x10c4 +# GD64: 1004: auipc a2, 0x1 +# GD64-NEXT: c.add a7, a7 +# GD64-NEXT: ld a3, 0xc4(a2) +# GD64-NEXT: c.add a7, a7 +# GD64-NEXT: addi a0, a2, 0xc4 +# GD64-NEXT: c.add a7, a7 +# GD64-NEXT: jalr t0, 0x0(a3) +# GD64-NEXT: c.add a0, tp +# GD64-NEXT: jal {{.*}} +## &.got[c]-. = 0x20c0+8 - 0x1020 = 0x10a8 +# GD64-NEXT: 1020: auipc a4, 0x1 +# GD64-NEXT: ld a5, 0xa8(a4) +# GD64-NEXT: addi a0, a4, 0xa8 +# GD64-NEXT: jalr t0, 0x0(a5) +# GD64-NEXT: c.add a0, tp +## &.got[c]-. 
= 0x20c0+8 - 0x1032 = 0x1096 +# GD64-NEXT: 1032: auipc a6, 0x1 +# GD64-NEXT: ld a7, 0x96(a6) +# GD64-NEXT: addi a0, a6, 0x96 +# GD64-NEXT: jalr t0, 0x0(a7) +# GD64-NEXT: c.add a0, tp + +# LE64-LABEL: <_start>: +# LE64-NEXT: jal {{.*}} +# LE64-LABEL: : +# LE64-NEXT: c.add a7, a7 +# LE64-NEXT: R_RISCV_TLSDESC_HI20 b +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: c.add a7, a7 +# LE64-NEXT: R_RISCV_TLSDESC_LOAD_LO12 .Ltlsdesc_hi0 +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: 11008: c.add a7, a7 +# LE64-NEXT: R_RISCV_TLSDESC_ADD_LO12 .Ltlsdesc_hi0 +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: addi a0, zero, 0x7ff +# LE64-NEXT: R_RISCV_TLSDESC_CALL .Ltlsdesc_hi0 +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: c.add a0, tp +# LE64-NEXT: jal {{.*}} +# LE64-NEXT: R_RISCV_JAL foo +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: addi a0, zero, 0x7ff +# LE64-NEXT: R_RISCV_TLSDESC_HI20 b +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: R_RISCV_TLSDESC_LOAD_LO12 .Ltlsdesc_hi1 +# LE64-NEXT: R_RISCV_TLSDESC_ADD_LO12 .Ltlsdesc_hi1 +# LE64-NEXT: R_RISCV_TLSDESC_CALL .Ltlsdesc_hi1 +# LE64-NEXT: c.add a0, tp +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: R_RISCV_TLSDESC_HI20 b +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: R_RISCV_TLSDESC_LOAD_LO12 .Ltlsdesc_hi2 +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: R_RISCV_TLSDESC_ADD_LO12 .Ltlsdesc_hi2 +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: addi a0, zero, 0x7ff +# LE64-NEXT: R_RISCV_TLSDESC_CALL .Ltlsdesc_hi2 +# LE64-NEXT: c.add a0, tp + +# LE64A-LABEL: <_start>: +# LE64A-NEXT: jal {{.*}} +# LE64A-LABEL: : +# LE64A-NEXT: c.add a7, a7 +# LE64A-NEXT: c.add a7, a7 +# LE64A-NEXT: 11008: lui a0, 0x2 +# LE64A-NEXT: c.add a7, a7 +# LE64A-NEXT: addi a0, a0, -0x479 +# LE64A-NEXT: c.add a0, tp +# LE64A-NEXT: jal {{.*}} +# LE64A-NEXT: lui a0, 0x2 +# LE64A-NEXT: addi a0, a0, -0x479 +# LE64A-NEXT: c.add a0, tp +# LE64A-NEXT: addi zero, zero, 0x0 +# LE64A-NEXT: addi zero, zero, 0x0 +# LE64A-NEXT: lui 
a0, 0x2 +# LE64A-NEXT: addi a0, a0, -0x479 +# LE64A-NEXT: c.add a0, tp + +# IE64: .got 00000010 00000000000120e0 +# IE64-LABEL: <_start>: +# IE64-NEXT: jal {{.*}} +# IE64-LABEL: : +# IE64-NEXT: c.add a7, a7 +# IE64-NEXT: c.add a7, a7 +## &.got[c]-. = 0x120e0+8 - 0x11008 = 0x10e0 +# IE64-NEXT: 11008: auipc a0, 0x1 +# IE64-NEXT: c.add a7, a7 +# IE64-NEXT: ld a0, 0xe0(a0) +# IE64-NEXT: c.add a0, tp +# IE64-NEXT: jal {{.*}} +## &.got[c]-. = 0x120e0+8 - 0x11018 = 0x10d0 +# IE64-NEXT: 11018: auipc a0, 0x1 +# IE64-NEXT: ld a0, 0xd0(a0) +# IE64-NEXT: c.add a0, tp +## &.got[c]-. = 0x120e0+8 - 0x1102a = 0x10be +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: 1102a: auipc a0, 0x1 +# IE64-NEXT: ld a0, 0xbe(a0) +# IE64-NEXT: c.add a0, tp + +#--- a.s +.globl _start +_start: +.balign 16 + call foo + +foo: +.Ltlsdesc_hi0: +.option norelax +## All 4 instructions have an R_RISCV_RELAX. +## Check that optimization/relaxation are not affected by irrelevant instructions. + auipc a2, %tlsdesc_hi(b) + .reloc .-4, R_RISCV_RELAX, 0 + c.add a7, a7 + ld a3, %tlsdesc_load_lo(.Ltlsdesc_hi0)(a2) + .reloc .-4, R_RISCV_RELAX, 0 + c.add a7, a7 + addi a0, a2, %tlsdesc_add_lo(.Ltlsdesc_hi0) + .reloc .-4, R_RISCV_RELAX, 0 + c.add a7, a7 + jalr t0, 0(a3), %tlsdesc_call(.Ltlsdesc_hi0) + .reloc .-4, R_RISCV_RELAX, 0 + add a0, a0, tp +.option relax + + call foo + +.Ltlsdesc_hi1: +.option norelax +## AUIPC has an R_RISCV_RELAX. We perform relaxation, ignoring whether other +## instructions have R_RISCV_RELAX. + auipc a4, %tlsdesc_hi(b) + .reloc .-4, R_RISCV_RELAX, 0 + ld a5, %tlsdesc_load_lo(.Ltlsdesc_hi1)(a4) + addi a0, a4, %tlsdesc_add_lo(.Ltlsdesc_hi1) + jalr t0, 0(a5), %tlsdesc_call(.Ltlsdesc_hi1) + add a0, a0, tp +.option relax + +.Ltlsdesc_hi2: +.option norelax +## AUIPC does not have R_RISCV_RELAX. No relaxation. 
+ auipc a6, %tlsdesc_hi(b) + ld a7, %tlsdesc_load_lo(.Ltlsdesc_hi2)(a6) + .reloc .-4, R_RISCV_RELAX, 0 + addi a0, a6, %tlsdesc_add_lo(.Ltlsdesc_hi2) + .reloc .-4, R_RISCV_RELAX, 0 + jalr t0, 0(a7), %tlsdesc_call(.Ltlsdesc_hi2) + add a0, a0, tp +.option relax + +.section .tbss +.globl a +.zero 8 +a: +.zero 2039+PAD ## Place b at 0x7ff+PAD + +#--- c.s +.tbss +.globl b +b: +.zero 4 diff --git a/lld/test/ELF/riscv-tlsdesc.s b/lld/test/ELF/riscv-tlsdesc.s new file mode 100644 index 00000000000000..1738f86256caa6 --- /dev/null +++ b/lld/test/ELF/riscv-tlsdesc.s @@ -0,0 +1,194 @@ +# REQUIRES: riscv +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: llvm-mc -filetype=obj -triple=riscv64 a.s -o a.64.o +# RUN: llvm-mc -filetype=obj -triple=riscv64 c.s -o c.64.o +# RUN: ld.lld -shared -soname=c.64.so c.64.o -o c.64.so +# RUN: llvm-mc -filetype=obj -triple=riscv32 --defsym ELF32=1 a.s -o a.32.o +# RUN: llvm-mc -filetype=obj -triple=riscv32 --defsym ELF32=1 c.s -o c.32.o +# RUN: ld.lld -shared -soname=c.32.so c.32.o -o c.32.so + +# RUN: ld.lld -shared -z now a.64.o c.64.o -o a.64.so +# RUN: llvm-readobj -r -x .got a.64.so | FileCheck --check-prefix=GD64-RELA %s +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -h -d a.64.so | FileCheck %s --check-prefix=GD64 + +# RUN: ld.lld -shared -z now a.64.o c.64.o -o rel.64.so -z rel +# RUN: llvm-readobj -r -x .got rel.64.so | FileCheck --check-prefix=GD64-REL %s + +# RUN: ld.lld -e 0 -z now a.64.o c.64.o -o a.64.le +# RUN: llvm-readelf -r a.64.le | FileCheck --check-prefix=NOREL %s +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -h -d a.64.le | FileCheck %s --check-prefix=LE64 + +# RUN: ld.lld -e 0 -z now a.64.o c.64.so -o a.64.ie +# RUN: llvm-readobj -r a.64.ie | FileCheck --check-prefix=IE64-RELA %s +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -h -d a.64.ie | FileCheck %s --check-prefix=IE64 + +## 32-bit code is mostly the same. We only test a few variants. The IE optimization uses the LW instruction. 
+ +# RUN: ld.lld -shared -z now a.32.o c.32.o -o rel.32.so -z rel +# RUN: llvm-readobj -r -x .got rel.32.so | FileCheck --check-prefix=GD32-REL %s +# RUN: ld.lld -e 0 -z now a.32.o c.32.so -o a.32.ie +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -h -d a.32.ie | FileCheck %s --check-prefix=IE32 + +# GD64-RELA: .rela.dyn { +# GD64-RELA-NEXT: 0x2408 R_RISCV_TLSDESC - 0x7FF +# GD64-RELA-NEXT: 0x23E8 R_RISCV_TLSDESC a 0x0 +# GD64-RELA-NEXT: 0x23F8 R_RISCV_TLSDESC c 0x0 +# GD64-RELA-NEXT: } +# GD64-RELA: Hex dump of section '.got': +# GD64-RELA-NEXT: 0x000023e0 20230000 00000000 00000000 00000000 # +# GD64-RELA-NEXT: 0x000023f0 00000000 00000000 00000000 00000000 . + +# GD64-REL: .rel.dyn { +# GD64-REL-NEXT: 0x23F0 R_RISCV_TLSDESC - +# GD64-REL-NEXT: 0x23D0 R_RISCV_TLSDESC a +# GD64-REL-NEXT: 0x23E0 R_RISCV_TLSDESC c +# GD64-REL-NEXT: } +# GD64-REL: Hex dump of section '.got': +# GD64-REL-NEXT: 0x000023c8 08230000 00000000 00000000 00000000 . +# GD64-REL-NEXT: 0x000023d8 00000000 00000000 00000000 00000000 . +# GD64-REL-NEXT: 0x000023e8 00000000 00000000 00000000 00000000 . +# GD64-REL-NEXT: 0x000023f8 ff070000 00000000 . + +# GD32-REL: .rel.dyn { +# GD32-REL-NEXT: 0x2274 R_RISCV_TLSDESC - +# GD32-REL-NEXT: 0x2264 R_RISCV_TLSDESC a +# GD32-REL-NEXT: 0x226C R_RISCV_TLSDESC c +# GD32-REL-NEXT: } +# GD32-REL: Hex dump of section '.got': +# GD32-REL-NEXT: 0x00002260 00220000 00000000 00000000 00000000 . +# GD32-REL-NEXT: 0x00002270 00000000 00000000 ff070000 . + +# GD64: .got 00000038 00000000000023e0 + +## &.got[a]-. = 0x23e0+8 - 0x12e0 = 0x1108 +# GD64: 12e0: auipc a0, 0x1 +# GD64-NEXT: ld a1, 0x108(a0) +# GD64-NEXT: addi a0, a0, 0x108 +# GD64-NEXT: jalr t0, 0x0(a1) +# GD64-NEXT: add a0, a0, tp + +## &.got[b]-. = 0x23e0+40 - 0x12f4 = 0x1114 +# GD64-NEXT: 12f4: auipc a2, 0x1 +# GD64-NEXT: ld a3, 0x114(a2) +# GD64-NEXT: addi a0, a2, 0x114 +# GD64-NEXT: jalr t0, 0x0(a3) +# GD64-NEXT: add a0, a0, tp + +## &.got[c]-. 
= 0x23e0+24 - 0x1308 = 0x10f0 +# GD64-NEXT: 1308: auipc a4, 0x1 +# GD64-NEXT: ld a5, 0xf0(a4) +# GD64-NEXT: addi a0, a4, 0xf0 +# GD64-NEXT: jalr t0, 0x0(a5) +# GD64-NEXT: add a0, a0, tp + +# NOREL: no relocations + +# LE64-LABEL: <.text>: +## st_value(a) = 8 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: addi a0, zero, 0x8 +# LE64-NEXT: add a0, a0, tp +## st_value(b) = 2047 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: addi a0, zero, 0x7ff +# LE64-NEXT: add a0, a0, tp +## st_value(c) = 2048 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: lui a0, 0x1 +# LE64-NEXT: addi a0, a0, -0x800 +# LE64-NEXT: add a0, a0, tp + +# IE64-RELA: .rela.dyn { +# IE64-RELA-NEXT: 0x123B0 R_RISCV_TLS_TPREL64 c 0x0 +# IE64-RELA-NEXT: } + +# IE64: .got 00000010 00000000000123a8 + +## a and b are optimized to use LE. c is optimized to IE. +# IE64-LABEL: <.text>: +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi a0, zero, 0x8 +# IE64-NEXT: add a0, a0, tp +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi a0, zero, 0x7ff +# IE64-NEXT: add a0, a0, tp +## &.got[c]-. 
= 0x123a8+8 - 0x112b8 = 0x10f8 +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: 112b8: auipc a0, 0x1 +# IE64-NEXT: ld a0, 0xf8(a0) +# IE64-NEXT: add a0, a0, tp + +# IE32: .got 00000008 00012248 + +# IE32-LABEL: <.text>: +## st_value(a) = 8 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: addi a0, zero, 0x8 +# IE32-NEXT: add a0, a0, tp +## st_value(b) = 2047 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: addi a0, zero, 0x7ff +# IE32-NEXT: add a0, a0, tp +## &.got[c]-. = 0x12248+4 - 0x111cc = 0x1080 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: 111cc: auipc a0, 0x1 +# IE32-NEXT: lw a0, 0x80(a0) +# IE32-NEXT: add a0, a0, tp + +#--- a.s +.macro load dst, src +.ifdef ELF32 +lw \dst, \src +.else +ld \dst, \src +.endif +.endm + +.Ltlsdesc_hi0: + auipc a0, %tlsdesc_hi(a) + load a1, %tlsdesc_load_lo(.Ltlsdesc_hi0)(a0) + addi a0, a0, %tlsdesc_add_lo(.Ltlsdesc_hi0) + jalr t0, 0(a1), %tlsdesc_call(.Ltlsdesc_hi0) + add a0, a0, tp + +.Ltlsdesc_hi1: + auipc a2, %tlsdesc_hi(b) + load a3, %tlsdesc_load_lo(.Ltlsdesc_hi1)(a2) + addi a0, a2, %tlsdesc_add_lo(.Ltlsdesc_hi1) + jalr t0, 0(a3), %tlsdesc_call(.Ltlsdesc_hi1) + add a0, a0, tp + +.Ltlsdesc_hi2: + auipc a4, %tlsdesc_hi(c) + load a5, %tlsdesc_load_lo(.Ltlsdesc_hi2)(a4) + addi a0, a4, %tlsdesc_add_lo(.Ltlsdesc_hi2) + jalr t0, 0(a5), %tlsdesc_call(.Ltlsdesc_hi2) + add a0, a0, tp + +.section .tbss +.globl a +.zero 8 +a: +.zero 2039 ## Place b at 0x7ff +b: +.zero 1 + +#--- c.s +.tbss +.globl c +c: .zero 4 diff --git a/lld/test/ELF/systemz-got.s b/lld/test/ELF/systemz-got.s new file mode 100644 index 00000000000000..1d558aa3b02905 --- /dev/null +++ b/lld/test/ELF/systemz-got.s @@ -0,0 +1,16 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o +# RUN: llvm-mc -filetype=obj 
-triple=s390x-unknown-linux %p/Inputs/shared.s -o %t2.o +# RUN: ld.lld -shared %t2.o -soname=%t2.so -o %t2.so + +# RUN: ld.lld -dynamic-linker /lib/ld64.so.1 %t.o %t2.so -o %t +# RUN: llvm-readelf -S -r %t | FileCheck %s + +# CHECK: .got PROGBITS {{.*}} {{.*}} 000020 00 WA 0 0 8 + +# CHECK: Relocation section '.rela.dyn' at offset {{.*}} contains 1 entries: +# CHECK: {{.*}} 000000010000000a R_390_GLOB_DAT 0000000000000000 bar + 0 + +.global _start +_start: + lgrl %r1,bar@GOT diff --git a/lld/test/ELF/systemz-gotent-relax-align.s b/lld/test/ELF/systemz-gotent-relax-align.s new file mode 100644 index 00000000000000..c6326086f56db0 --- /dev/null +++ b/lld/test/ELF/systemz-gotent-relax-align.s @@ -0,0 +1,48 @@ +# REQUIRES: systemz +## Verify that R_390_GOTENT optimization is not performed on misaligned symbols. + +# RUN: llvm-mc -filetype=obj -relax-relocations -triple=s390x-unknown-linux %s -o %t.o +# RUN: ld.lld %t.o -o %t1 +# RUN: llvm-readelf -S -r -x .got -x .got.plt %t1 | FileCheck --check-prefixes=CHECK %s +# RUN: llvm-objdump --no-print-imm-hex -d %t1 | FileCheck --check-prefix=DISASM %s + +## We retain one .got entry for the unaligned symbol. 
+# CHECK: Name Type Address Off Size ES Flg Lk Inf Al +# CHECK: .got PROGBITS 00000000010021e0 0001e0 000020 00 WA 0 0 8 +# CHECK-NEXT: .relro_padding NOBITS 0000000001002200 000200 000e00 00 WA 0 0 1 +# CHECK-NEXT: .data PROGBITS 0000000001003200 000200 000006 00 WA 0 0 2 + +# CHECK-LABEL: Hex dump of section '.got': +# CHECK-NEXT: 0x010021e0 00000000 00000000 00000000 00000000 +# CHECK-NEXT: 0x010021f0 00000000 00000000 00000000 01003205 + +# DISASM: Disassembly of section .text: +# DISASM: <_start>: +# DISASM-NEXT: larl %r1, 0x1003200 +# DISASM-NEXT: larl %r1, 0x1003200 +# DISASM-NEXT: lgrl %r1, 0x10021f8 +# DISASM-NEXT: lgrl %r1, 0x10021f8 + +.data +.globl var_align +.hidden var_align + .align 2 +var_align: + .long 0 + +.data +.globl var_unalign +.hidden var_unalign + .align 2 + .byte 0 +var_unalign: + .byte 0 + +.text +.globl _start +.type _start, @function +_start: + lgrl %r1, var_align@GOT + lgrl %r1, var_align@GOT + lgrl %r1, var_unalign@GOT + lgrl %r1, var_unalign@GOT diff --git a/lld/test/ELF/systemz-gotent-relax-und-dso.s b/lld/test/ELF/systemz-gotent-relax-und-dso.s new file mode 100644 index 00000000000000..57369a417fd445 --- /dev/null +++ b/lld/test/ELF/systemz-gotent-relax-und-dso.s @@ -0,0 +1,68 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -relax-relocations -triple=s390x-unknown-linux %s -o %t.o +# RUN: llvm-mc -filetype=obj -relax-relocations -triple=s390x-unknown-linux %S/Inputs/gotpc-relax-und-dso.s -o %tdso.o +# RUN: ld.lld -shared %tdso.o -soname=t.so -o %t.so +# RUN: ld.lld --hash-style=sysv -shared %t.o %t.so -o %t +# RUN: llvm-readelf -r %t | FileCheck --check-prefix=RELOC %s +# RUN: llvm-objdump --no-print-imm-hex -d %t | FileCheck --check-prefix=DISASM %s + +# RELOC-LABEL: Relocation section '.rela.dyn' at offset {{.*}} contains 3 entries: +# RELOC: 00000000000023f8 000000010000000a R_390_GLOB_DAT 00000000000012d8 foo + 0 +# RELOC: 0000000000002400 000000030000000a R_390_GLOB_DAT 0000000000000000 und + 0 +# RELOC: 
0000000000002408 000000040000000a R_390_GLOB_DAT 0000000000000000 dsofoo + 0 + +# DISASM: Disassembly of section .text: +# DISASM-EMPTY: +# DISASM-NEXT: : +# DISASM-NEXT: bc 0, 0 +# DISASM: : +# DISASM-NEXT: bc 0, 0 +# DISASM: <_start>: +# DISASM-NEXT: lgrl %r1, 0x2400 +# DISASM-NEXT: lgrl %r1, 0x2400 +# DISASM-NEXT: lgrl %r1, 0x2408 +# DISASM-NEXT: lgrl %r1, 0x2408 +# DISASM-NEXT: larl %r1, 0x12dc +# DISASM-NEXT: larl %r1, 0x12dc +# DISASM-NEXT: lgrl %r1, 0x23f8 +# DISASM-NEXT: lgrl %r1, 0x23f8 +# DISASM-NEXT: lgrl %r1, 0x2400 +# DISASM-NEXT: lgrl %r1, 0x2400 +# DISASM-NEXT: lgrl %r1, 0x2408 +# DISASM-NEXT: lgrl %r1, 0x2408 +# DISASM-NEXT: larl %r1, 0x12dc +# DISASM-NEXT: larl %r1, 0x12dc +# DISASM-NEXT: lgrl %r1, 0x23f8 +# DISASM-NEXT: lgrl %r1, 0x23f8 + +.text +.globl foo +.type foo, @function +foo: + nop + +.globl hid +.hidden hid +.type hid, @function +hid: + nop + +.globl _start +.type _start, @function +_start: + lgrl %r1, und@GOT + lgrl %r1, und@GOT + lgrl %r1, dsofoo@GOT + lgrl %r1, dsofoo@GOT + lgrl %r1, hid@GOT + lgrl %r1, hid@GOT + lgrl %r1, foo@GOT + lgrl %r1, foo@GOT + lgrl %r1, und@GOT + lgrl %r1, und@GOT + lgrl %r1, dsofoo@GOT + lgrl %r1, dsofoo@GOT + lgrl %r1, hid@GOT + lgrl %r1, hid@GOT + lgrl %r1, foo@GOT + lgrl %r1, foo@GOT diff --git a/lld/test/ELF/systemz-gotent-relax.s b/lld/test/ELF/systemz-gotent-relax.s new file mode 100644 index 00000000000000..f665e1af9e53d2 --- /dev/null +++ b/lld/test/ELF/systemz-gotent-relax.s @@ -0,0 +1,91 @@ +# REQUIRES: systemz +## Test R_390_GOTENT optimization. 
+ +# RUN: llvm-mc -filetype=obj -relax-relocations -triple=s390x-unknown-linux %s -o %t.o +# RUN: ld.lld %t.o -o %t1 --no-apply-dynamic-relocs +# RUN: llvm-readelf -S -r -x .got.plt %t1 | FileCheck --check-prefixes=CHECK,NOAPPLY %s +# RUN: ld.lld %t.o -o %t1 --apply-dynamic-relocs +# RUN: llvm-readelf -S -r -x .got.plt %t1 | FileCheck --check-prefixes=CHECK,APPLY %s +# RUN: ld.lld %t.o -o %t1 +# RUN: llvm-objdump --no-print-imm-hex -d %t1 | FileCheck --check-prefix=DISASM %s + +## --no-relax disables GOT optimization. +# RUN: ld.lld --no-relax %t.o -o %t2 +# RUN: llvm-objdump --no-print-imm-hex -d %t2 | FileCheck --check-prefix=NORELAX %s + +## In our implementation, .got is retained even if all GOT-generating relocations are optimized. +# CHECK: Name Type Address Off Size ES Flg Lk Inf Al +# CHECK: .iplt PROGBITS 0000000001001240 000240 000020 00 AX 0 0 16 +# CHECK-NEXT: .got PROGBITS 0000000001002260 000260 000018 00 WA 0 0 8 +# CHECK-NEXT: .relro_padding NOBITS 0000000001002278 000278 000d88 00 WA 0 0 1 +# CHECK-NEXT: .got.plt PROGBITS 0000000001003278 000278 000008 00 WA 0 0 8 + +## There is one R_S390_IRELATIVE relocation. 
+# CHECK-LABEL: Relocation section '.rela.dyn' at offset {{.*}} contains 1 entries: +# CHECK: 0000000001003278 000000000000003d R_390_IRELATIVE 10011e8 + +# CHECK-LABEL: Hex dump of section '.got.plt': +# NOAPPLY-NEXT: 0x01003278 00000000 00000000 +# APPLY-NEXT: 0x01003278 00000000 010011e8 + +# DISASM: Disassembly of section .text: +# DISASM: 00000000010011e0 : +# DISASM-NEXT: bc 0, 0 +# DISASM: 00000000010011e4 : +# DISASM-NEXT: bc 0, 0 +# DISASM: 00000000010011e8 : +# DISASM-NEXT: br %r14 +# DISASM: 00000000010011ea <_start>: +# DISASM-NEXT: larl %r1, 0x10011e0 +# DISASM-NEXT: larl %r1, 0x10011e0 +# DISASM-NEXT: larl %r1, 0x10011e4 +# DISASM-NEXT: larl %r1, 0x10011e4 +# DISASM-NEXT: lgrl %r1, 0x1003278 +# DISASM-NEXT: lgrl %r1, 0x1003278 +# DISASM-NEXT: larl %r1, 0x10011e0 +# DISASM-NEXT: larl %r1, 0x10011e0 +# DISASM-NEXT: larl %r1, 0x10011e4 +# DISASM-NEXT: larl %r1, 0x10011e4 +# DISASM-NEXT: lgrl %r1, 0x1003278 +# DISASM-NEXT: lgrl %r1, 0x1003278 + +# NORELAX-LABEL: <_start>: +# NORELAX-COUNT-12: lgrl + +.text +.globl foo + +.text +.globl foo +.type foo, @function +foo: + nop + +.globl hid +.hidden hid +.type hid, @function +hid: + nop + +.text +.type ifunc STT_GNU_IFUNC +.globl ifunc +.type ifunc, @function +ifunc: + br %r14 + +.globl _start +.type _start, @function +_start: + lgrl %r1, foo@GOT + lgrl %r1, foo@GOT + lgrl %r1, hid@GOT + lgrl %r1, hid@GOT + lgrl %r1, ifunc@GOT + lgrl %r1, ifunc@GOT + lgrl %r1, foo@GOT + lgrl %r1, foo@GOT + lgrl %r1, hid@GOT + lgrl %r1, hid@GOT + lgrl %r1, ifunc@GOT + lgrl %r1, ifunc@GOT diff --git a/lld/test/ELF/systemz-ifunc-nonpreemptible.s b/lld/test/ELF/systemz-ifunc-nonpreemptible.s new file mode 100644 index 00000000000000..5056db302ca1c7 --- /dev/null +++ b/lld/test/ELF/systemz-ifunc-nonpreemptible.s @@ -0,0 +1,75 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x-none-linux-gnu %s -o %t.o +# RUN: ld.lld -static %t.o -o %t +# RUN: ld.lld -static %t.o -o %t.apply --apply-dynamic-relocs +# RUN: 
llvm-readelf --section-headers --relocations --symbols %t | FileCheck %s +# RUN: llvm-readelf -x .got.plt %t | FileCheck %s --check-prefix=NO-APPLY-RELOC +# RUN: llvm-readelf -x .got.plt %t.apply | FileCheck %s --check-prefix=APPLY-RELOC +# RUN: llvm-objdump --no-print-imm-hex -d --no-show-raw-insn %t | FileCheck %s --check-prefix=DISASM + +# CHECK: Section Headers: +# CHECK-NEXT: [Nr] Name Type Address Off Size ES Flg Lk Inf Al +# CHECK-NEXT: [ 0] NULL 0000000000000000 000000 000000 00 0 0 0 +# CHECK-NEXT: [ 1] .rela.dyn RELA 0000000001000158 000158 000030 18 AI 0 4 8 +# CHECK-NEXT: [ 2] .text PROGBITS 0000000001001188 000188 00001c 00 AX 0 0 4 +# CHECK-NEXT: [ 3] .iplt PROGBITS 00000000010011b0 0001b0 000040 00 AX 0 0 16 +# CHECK-NEXT: [ 4] .got.plt PROGBITS 00000000010021f0 0001f0 000010 00 WA 0 0 8 + +# CHECK: Relocation section '.rela.dyn' at offset 0x158 contains 2 entries: +# CHECK-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend +# CHECK-NEXT: 00000000010021f0 000000000000003d R_390_IRELATIVE 1001188 +# CHECK-NEXT: 00000000010021f8 000000000000003d R_390_IRELATIVE 100118a + +# CHECK: Symbol table '.symtab' contains 6 entries: +# CHECK-NEXT: Num: Value Size Type Bind Vis Ndx Name +# CHECK-NEXT: 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND +# CHECK-NEXT: 1: 0000000001000158 0 NOTYPE LOCAL HIDDEN 1 __rela_iplt_start +# CHECK-NEXT: 2: 0000000001000188 0 NOTYPE LOCAL HIDDEN 1 __rela_iplt_end +# CHECK-NEXT: 3: 0000000001001188 0 IFUNC GLOBAL DEFAULT 2 foo +# CHECK-NEXT: 4: 000000000100118a 0 IFUNC GLOBAL DEFAULT 2 bar +# CHECK-NEXT: 5: 000000000100118c 0 NOTYPE GLOBAL DEFAULT 2 _start + +# NO-APPLY-RELOC-LABEL: Hex dump of section '.got.plt': +# NO-APPLY-RELOC-NEXT: 0x010021f0 00000000 00000000 00000000 00000000 +# NO-APPLY-RELOC-EMPTY: + +# APPLY-RELOC-LABEL: Hex dump of section '.got.plt': +# APPLY-RELOC-NEXT: 0x010021f0 00000000 01001188 00000000 0100118a +# APPLY-RELOC-EMPTY: + +# DISASM: Disassembly of section .text: +# DISASM: 0000000001001188 
: +# DISASM-NEXT: br %r14 +# DISASM: 000000000100118a : +# DISASM-NEXT: br %r14 +# DISASM: 000000000100118c <_start>: +# DISASM-NEXT: brasl %r14, 0x10011b0 +# DISASM-NEXT: brasl %r14, 0x10011d0 +# DISASM-NEXT: larl %r2, 0x1000158 +# DISASM-NEXT: larl %r2, 0x1000188 +# DISASM: Disassembly of section .iplt: +# DISASM: <.iplt>: +# DISASM: 10011b0: larl %r1, 0x10021f0 +# DISASM-NEXT: 10011b6: lg %r1, 0(%r1) +# DISASM-NEXT: 10011bc: br %r1 +# DISASM: 10011d0: larl %r1, 0x10021f8 +# DISASM-NEXT: 10011d6: lg %r1, 0(%r1) +# DISASM-NEXT: 10011dc: br %r1 + +.text +.type foo STT_GNU_IFUNC +.globl foo +foo: + br %r14 + +.type bar STT_GNU_IFUNC +.globl bar +bar: + br %r14 + +.globl _start +_start: + brasl %r14, foo@plt + brasl %r14, bar@plt + larl %r2, __rela_iplt_start + larl %r2, __rela_iplt_end diff --git a/lld/test/ELF/systemz-init-padding.s b/lld/test/ELF/systemz-init-padding.s new file mode 100644 index 00000000000000..c56b98d43f1b0e --- /dev/null +++ b/lld/test/ELF/systemz-init-padding.s @@ -0,0 +1,27 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %p/Inputs/systemz-init.s -o systemz-init.o +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o +# RUN: ld.lld -dynamic-linker /lib/ld64.so.1 %t.o systemz-init.o -o %t +# RUN: llvm-objdump -d --no-show-raw-insn -j .init %t | FileCheck %s + +# glibc < 2.39 used to align .init and .fini code at a 4-byte boundary. +# When that happens, the linker must not pad the code with invalid +# instructions, e.g. null bytes. 
+ .section .init,"ax",@progbits + brasl %r14, startup + +# CHECK: <.init>: +# CHECK-NEXT: brasl %r14, +# CHECK-NEXT: bcr 0, %r7 +# CHECK-NEXT: lg %r4, 272(%r15) + + .text + .globl startup + .p2align 4 +startup: + br %r14 + + .globl main + .p2align 4 +main: + br %r14 diff --git a/lld/test/ELF/systemz-pie.s b/lld/test/ELF/systemz-pie.s new file mode 100644 index 00000000000000..bb971a82fb8ced --- /dev/null +++ b/lld/test/ELF/systemz-pie.s @@ -0,0 +1,38 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t1.o + +## Check -pie. +# RUN: ld.lld -pie %t1.o -o %t +# RUN: llvm-readelf --file-headers --program-headers --dynamic %t | FileCheck %s + +# CHECK: ELF Header: +# CHECK-NEXT: Magic: 7f 45 4c 46 02 02 01 00 00 00 00 00 00 00 00 00 +# CHECK-NEXT: Class: ELF64 +# CHECK-NEXT: Data: 2's complement, big endian +# CHECK-NEXT: Version: 1 (current) +# CHECK-NEXT: OS/ABI: UNIX - System V +# CHECK-NEXT: ABI Version: 0 +# CHECK-NEXT: Type: DYN (Shared object file) +# CHECK-NEXT: Machine: IBM S/390 +# CHECK-NEXT: Version: 0x1 + +# CHECK: Program Headers: +# CHECK-NEXT: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align +# CHECK-NEXT: PHDR 0x000040 0x0000000000000040 0x0000000000000040 0x000188 0x000188 R 0x8 +# CHECK-NEXT: LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x00020d 0x00020d R 0x1000 +# CHECK-NEXT: LOAD 0x000210 0x0000000000002210 0x0000000000002210 0x000090 0x000df0 RW 0x1000 +# CHECK-NEXT: DYNAMIC 0x000210 0x0000000000002210 0x0000000000002210 0x000090 0x000090 RW 0x8 +# CHECK-NEXT: GNU_RELRO 0x000210 0x0000000000002210 0x0000000000002210 0x000090 0x000df0 R 0x1 +# CHECK-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0x0 + +# CHECK: Dynamic section at offset 0x210 contains 9 entries: +# CHECK-NEXT: Tag Type Name/Value +# CHECK-NEXT: 0x000000006ffffffb (FLAGS_1) PIE + +## Check -nopie +# RUN: ld.lld -no-pie %t1.o -o %t2 +# RUN: llvm-readelf --file-headers %t2 | FileCheck %s 
--check-prefix=NOPIE +# NOPIE-NOT: Type: DYN + +.globl _start +_start: diff --git a/lld/test/ELF/systemz-plt.s b/lld/test/ELF/systemz-plt.s new file mode 100644 index 00000000000000..4669f01f588121 --- /dev/null +++ b/lld/test/ELF/systemz-plt.s @@ -0,0 +1,83 @@ +# REQUIRES: systemz +# RUN: echo '.globl bar, weak; .type bar,@function; .type weak,@function; bar: weak:' > %t1.s + +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %t1.s -o %t1.o +# RUN: ld.lld -shared %t1.o -soname=t1.so -o %t1.so +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o +# RUN: ld.lld %t.o %t1.so -z separate-code -o %t +# RUN: llvm-readelf -S -s -r -x .got.plt %t | FileCheck %s +# RUN: llvm-objdump -d %t | FileCheck --check-prefixes=DIS %s + +# CHECK: Section Headers: +# CHECK: .plt PROGBITS 0000000001001020 001020 000060 00 AX 0 0 16 +# CHECK: .got PROGBITS 00000000010020d0 0020d0 000018 00 WA 0 0 8 +# CHECK: .got.plt PROGBITS 00000000010030e8 0020e8 000010 00 WA 0 0 8 + +# CHECK: Relocation section '.rela.plt' at offset {{.*}} contains 2 entries: +# CHECK: 00000000010030e8 000000010000000b R_390_JMP_SLOT 0000000000000000 bar + 0 +# CHECK: 00000000010030f0 000000020000000b R_390_JMP_SLOT 0000000000000000 weak + 0 + +## A canonical PLT has a non-zero st_value. bar and weak are called but their +## addresses are not taken, so a canonical PLT is not necessary. +# CHECK: Symbol table '.dynsym' contains 3 entries: +# CHECK-NEXT: Num: Value Size Type Bind Vis Ndx Name +# CHECK-NEXT: 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND +# CHECK-NEXT: 1: 0000000000000000 0 FUNC GLOBAL DEFAULT UND bar +# CHECK-NEXT: 2: 0000000000000000 0 FUNC WEAK DEFAULT UND weak + +## The .got.plt slots relocated by .rela.plt point to .plt +## This is required by glibc. 
+# CHECK: Hex dump of section '.got.plt': +# CHECK-NEXT: 0x010030e8 00000000 0100104e 00000000 0100106e + +# DIS: Disassembly of section .text: + +# DIS: 0000000001001000 <_start>: +# DIS-NEXT: brasl %r14, 0x1001012 +# DIS-NEXT: brasl %r14, 0x1001040 +# DIS-NEXT: brasl %r14, 0x1001060 + +# DIS: 0000000001001012 : +# DIS-NEXT: br %r14 + +# DIS: Disassembly of section .plt: + +# DIS: 0000000001001020 <.plt>: +# DIS-NEXT: 1001020: e3 10 f0 38 00 24 stg %r1, 56(%r15) +# DIS-NEXT: 1001026: c0 10 00 00 08 55 larl %r1, 0x10020d0 +# DIS-NEXT: 100102c: d2 07 f0 30 10 08 mvc 48(8,%r15), 8(%r1) +# DIS-NEXT: 1001032: e3 10 10 10 00 04 lg %r1, 16(%r1) +# DIS-NEXT: 1001038: 07 f1 br %r1 +# DIS-NEXT: 100103a: 07 00 bcr 0, %r0 +# DIS-NEXT: 100103c: 07 00 bcr 0, %r0 +# DIS-NEXT: 100103e: 07 00 bcr 0, %r0 +# DIS-NEXT: 1001040: c0 10 00 00 10 54 larl %r1, 0x10030e8 +# DIS-NEXT: 1001046: e3 10 10 00 00 04 lg %r1, 0(%r1) +# DIS-NEXT: 100104c: 07 f1 br %r1 +# DIS-NEXT: 100104e: 0d 10 basr %r1, 0 +# DIS-NEXT: 1001050: e3 10 10 0c 00 14 lgf %r1, 12(%r1) +# DIS-NEXT: 1001056: c0 f4 ff ff ff e5 jg 0x1001020 +# DIS-NEXT: 100105c: 00 00 +# DIS-NEXT: 100105e: 00 00 +# DIS-NEXT: 1001060: c0 10 00 00 10 48 larl %r1, 0x10030f0 +# DIS-NEXT: 1001066: e3 10 10 00 00 04 lg %r1, 0(%r1) +# DIS-NEXT: 100106c: 07 f1 br %r1 +# DIS-NEXT: 100106e: 0d 10 basr %r1, 0 +# DIS-NEXT: 1001070: e3 10 10 0c 00 14 lgf %r1, 12(%r1) +# DIS-NEXT: 1001076: c0 f4 ff ff ff d5 jg 0x1001020 +# DIS-NEXT: 100107c: 00 00 +# DIS-NEXT: 100107e: 00 18 + +.global _start, foo, bar +.weak weak + +_start: + ## Use @plt to avoid generating direct references that would force + ## allocation of a canonical PLT entry. + brasl %r14, foo@plt + brasl %r14, bar@plt + brasl %r14, weak@plt + +## foo is local and non-preemptable, no PLT is generated. 
+foo: + br %r14 diff --git a/lld/test/ELF/systemz-reloc-abs.s b/lld/test/ELF/systemz-reloc-abs.s new file mode 100644 index 00000000000000..b5ad94d90d3a96 --- /dev/null +++ b/lld/test/ELF/systemz-reloc-abs.s @@ -0,0 +1,32 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x %s -o %t.o +# RUN: llvm-mc -filetype=obj -triple=s390x %S/Inputs/abs255.s -o %t255.o +# RUN: llvm-mc -filetype=obj -triple=s390x %S/Inputs/abs256.s -o %t256.o +# RUN: llvm-mc -filetype=obj -triple=s390x %S/Inputs/abs257.s -o %t257.o + +# RUN: ld.lld %t.o %t256.o -o %t +# RUN: llvm-readelf -x .data %t | FileCheck %s +# CHECK: 0x{{[0-9a-f]+}} ff80ffff 8000ffff ffff8000 0000ffff +# CHECK-NEXT: ffffffff ffff8000 00000000 0000 + +# RUN: not ld.lld %t.o %t255.o -o /dev/null 2>&1 | FileCheck --check-prefix=OVERFLOW1 %s +# OVERFLOW1: relocation R_390_8 out of range: -129 is not in [-128, 255] +# OVERFLOW1: relocation R_390_16 out of range: -32769 is not in [-32768, 65535] +# OVERFLOW1: relocation R_390_32 out of range: -2147483649 is not in [-2147483648, 4294967295] + +# RUN: not ld.lld %t.o %t257.o -o /dev/null 2>&1 | FileCheck --check-prefix=OVERFLOW2 %s +# OVERFLOW2: relocation R_390_8 out of range: 256 is not in [-128, 255] +# OVERFLOW2: relocation R_390_16 out of range: 65536 is not in [-32768, 65535] +# OVERFLOW2: relocation R_390_32 out of range: 4294967296 is not in [-2147483648, 4294967295] + +.globl _start +_start: +.data +.byte foo - 1 +.byte foo - 384 +.word foo + 0xfeff +.word foo - 0x8100 +.long foo + 0xfffffeff +.long foo - 0x80000100 +.quad foo + 0xfffffffffffffeff +.quad foo - 0x8000000000000100 diff --git a/lld/test/ELF/systemz-reloc-disp12.s b/lld/test/ELF/systemz-reloc-disp12.s new file mode 100644 index 00000000000000..3d32707d149fe7 --- /dev/null +++ b/lld/test/ELF/systemz-reloc-disp12.s @@ -0,0 +1,21 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym DISP=291 %s -o %t1.o +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym DISP=4095 %s -o 
%t2.o +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym DISP=4096 %s -o %t3.o + +# RUN: ld.lld --section-start=.text=0x0 %t1.o -o %t1out +# RUN: ld.lld --section-start=.text=0x0 %t2.o -o %t2out +# RUN: not ld.lld --section-start=.text=0x0 %t3.o -o /dev/null 2>&1 | FileCheck %s --check-prefix RANGE + +# RANGE: relocation R_390_12 out of range: 4096 is not in [0, 4095] + +# RUN: llvm-readelf --hex-dump=.text %t1out | FileCheck %s -DINSN=58678123 --check-prefix DUMP +# RUN: llvm-readelf --hex-dump=.text %t2out | FileCheck %s -DINSN=58678fff --check-prefix DUMP + +# DUMP: 0x00000000 [[INSN]] + +.text +.globl _start +_start: + .reloc .+2, R_390_12, DISP + l %r6, 0(%r7,%r8) diff --git a/lld/test/ELF/systemz-reloc-disp20.s b/lld/test/ELF/systemz-reloc-disp20.s new file mode 100644 index 00000000000000..88cd657c6ae3cb --- /dev/null +++ b/lld/test/ELF/systemz-reloc-disp20.s @@ -0,0 +1,21 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym DISP=74565 %s -o %t1.o +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym DISP=524287 %s -o %t2.o +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym DISP=524288 %s -o %t3.o + +# RUN: ld.lld --section-start=.text=0x0 %t1.o -o %t1out +# RUN: ld.lld --section-start=.text=0x0 %t2.o -o %t2out +# RUN: not ld.lld --section-start=.text=0x0 %t3.o -o /dev/null 2>&1 | FileCheck %s --check-prefix RANGE + +# RANGE: relocation R_390_20 out of range: 524288 is not in [-524288, 524287] + +# RUN: llvm-readelf --hex-dump=.text %t1out | FileCheck %s -DINSN="e3678345 1204" --check-prefix DUMP +# RUN: llvm-readelf --hex-dump=.text %t2out | FileCheck %s -DINSN="e3678fff 7f04" --check-prefix DUMP + +# DUMP: 0x00000000 [[INSN]] + +.text +.globl _start +_start: + .reloc .+2, R_390_20, DISP + lg %r6, 0(%r7,%r8) diff --git a/lld/test/ELF/systemz-reloc-got.s b/lld/test/ELF/systemz-reloc-got.s new file mode 100644 index 00000000000000..4b9ac16481f4cb --- /dev/null +++ b/lld/test/ELF/systemz-reloc-got.s @@ -0,0 +1,92 @@ +# REQUIRES: systemz 
+# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o +# RUN: ld.lld -z norelro -shared %t.o -soname=t.so -o %t.so +## Note: Without norelro the distance between .got and .got.plt causes +## R_390_GOTPLT12 relocations to always overflow. + +# RUN: llvm-readelf -S -x .data %t.so | FileCheck %s +# RUN: llvm-objdump -d --no-show-raw-insn %t.so | FileCheck %s --check-prefix=DISASM + +# CHECK: Section Headers: +# CHECK: .got PROGBITS 0000000000002458 +# CHECK: .got.plt PROGBITS 0000000000002480 + +## Note: _GLOBAL_OFFSET_TABLE is at .got +## GOT (foo) is at .got + 24 == 0x2470 +## GOT (bar) is at .got + 32 == 0x2478 +## GOTPLT (foo) is at .got.plt + 0 == .got + 40 == 0x2480 +## GOTPLT (bar) is at .got.plt + 8 == .got + 48 == 0x2488 + +# DISASM: larl %r12, 0x2458 +# DISASM-NEXT: larl %r1, 0x2470 +# DISASM-NEXT: larl %r1, 0x2478 +# DISASM-NEXT: larl %r1, 0x2480 +# DISASM-NEXT: larl %r1, 0x2488 + +# DISASM-NEXT: l %r1, 24(%r12) +# DISASM-NEXT: l %r1, 32(%r12) +# DISASM-NEXT: l %r1, 40(%r12) +# DISASM-NEXT: l %r1, 48(%r12) +# DISASM-NEXT: lg %r1, 24(%r12) +# DISASM-NEXT: lg %r1, 32(%r12) +# DISASM-NEXT: lg %r1, 40(%r12) +# DISASM-NEXT: lg %r1, 48(%r12) + +# CHECK: Hex dump of section '.data': +# CHECK-NEXT: 00180020 00280030 00000018 00000020 +# CHECK-NEXT: 00000028 00000030 00000000 00000018 +# CHECK-NEXT: 00000000 00000020 00000000 00000028 +# CHECK-NEXT: 00000000 00000030 + +.text +larl %r12, _GLOBAL_OFFSET_TABLE_ +.reloc .+2, R_390_GOTENT, foo+2 +larl %r1, 0 +.reloc .+2, R_390_GOTENT, bar+2 +larl %r1, 0 +.reloc .+2, R_390_GOTPLTENT, foo+2 +larl %r1, 0 +.reloc .+2, R_390_GOTPLTENT, bar+2 +larl %r1, 0 +.reloc .+2, R_390_GOT12, foo +l %r1, 0(%r12) +.reloc .+2, R_390_GOT12, bar +l %r1, 0(%r12) +.reloc .+2, R_390_GOTPLT12, foo +l %r1, 0(%r12) +.reloc .+2, R_390_GOTPLT12, bar +l %r1, 0(%r12) +.reloc .+2, R_390_GOT20, foo +lg %r1, 0(%r12) +.reloc .+2, R_390_GOT20, bar +lg %r1, 0(%r12) +.reloc .+2, R_390_GOTPLT20, foo +lg %r1, 0(%r12) +.reloc .+2, R_390_GOTPLT20, 
bar +lg %r1, 0(%r12) + +.data +.reloc ., R_390_GOT16, foo +.space 2 +.reloc ., R_390_GOT16, bar +.space 2 +.reloc ., R_390_GOTPLT16, foo +.space 2 +.reloc ., R_390_GOTPLT16, bar +.space 2 +.reloc ., R_390_GOT32, foo +.space 4 +.reloc ., R_390_GOT32, bar +.space 4 +.reloc ., R_390_GOTPLT32, foo +.space 4 +.reloc ., R_390_GOTPLT32, bar +.space 4 +.reloc ., R_390_GOT64, foo +.space 8 +.reloc ., R_390_GOT64, bar +.space 8 +.reloc ., R_390_GOTPLT64, foo +.space 8 +.reloc ., R_390_GOTPLT64, bar +.space 8 diff --git a/lld/test/ELF/systemz-reloc-gotrel.s b/lld/test/ELF/systemz-reloc-gotrel.s new file mode 100644 index 00000000000000..46669ecfa7fd01 --- /dev/null +++ b/lld/test/ELF/systemz-reloc-gotrel.s @@ -0,0 +1,36 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o +# RUN: ld.lld -shared %t.o -soname=t.so -o %t.so + +# RUN: llvm-readelf -S -s -x .data %t.so | FileCheck %s + +# CHECK: Section Headers: +# CHECK: .plt PROGBITS 0000000000001290 +# CHECK: .got PROGBITS 0000000000002390 + +# CHECK: Symbol table '.symtab' +# CHECK: 0000000000001288 {{.*}} bar + +## Note: foo is the first (and only) PLT entry, which resides at .plt + 32 +## PLTOFF (foo) is (.plt + 32) - .got == 0x12b0 - 0x2390 == 0xffffef20 +## GOTOFF (bar) is bar - .got == 0x1288 - 0x2390 == 0xffffeef8 +# CHECK: Hex dump of section '.data': +# CHECK-NEXT: eef8ef20 ffffeef8 ffffef20 ffffffff +# CHECK-NEXT: ffffeef8 ffffffff ffffef20 + +bar: + br %r14 + +.data +.reloc ., R_390_GOTOFF16, bar +.space 2 +.reloc ., R_390_PLTOFF16, foo +.space 2 +.reloc ., R_390_GOTOFF, bar +.space 4 +.reloc ., R_390_PLTOFF32, foo +.space 4 +.reloc ., R_390_GOTOFF64, bar +.space 8 +.reloc ., R_390_PLTOFF64, foo +.space 8 diff --git a/lld/test/ELF/systemz-reloc-pc16.s b/lld/test/ELF/systemz-reloc-pc16.s new file mode 100644 index 00000000000000..e1dad5af239d45 --- /dev/null +++ b/lld/test/ELF/systemz-reloc-pc16.s @@ -0,0 +1,39 @@ +# REQUIRES: systemz +# RUN: rm -rf %t && split-file %s %t + +## 
Check recompile with -fPIC error message +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %t/shared.s -o %t/shared.o +# RUN: not ld.lld -shared %t/shared.o -o /dev/null 2>&1 | FileCheck %s + +# CHECK: error: relocation R_390_PC16 cannot be used against symbol '_shared'; recompile with -fPIC +# CHECK: >>> defined in {{.*}} +# CHECK: >>> referenced by {{.*}}:(.data+0x1) + +## Check patching of negative addends + +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym ADDEND=1 %t/addend.s -o %t/1.o +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym ADDEND=32768 %t/addend.s -o %t/2.o +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym ADDEND=32769 %t/addend.s -o %t/3.o + +# RUN: ld.lld --section-start=.text=0x0 %t/1.o -o %t/1out +# RUN: ld.lld --section-start=.text=0x0 %t/2.o -o %t/2out +# RUN: not ld.lld --section-start=.text=0x0 %t/3.o -o /dev/null 2>&1 | FileCheck %s -DFILE=%t/3.o --check-prefix RANGE + +# RANGE: error: [[FILE]]:(.text+0x0): relocation R_390_PC16 out of range + +# RUN: llvm-readelf --hex-dump=.text %t/1out | FileCheck %s -DADDEND=ffff --check-prefix DUMP +# RUN: llvm-readelf --hex-dump=.text %t/2out | FileCheck %s -DADDEND=8000 --check-prefix DUMP + +# DUMP: 0x00000000 [[ADDEND]] + +#--- shared.s +.data + .byte 0xe8 + .word _shared - . 
+ +#--- addend.s +.text +.globl _start +_start: + .reloc ., R_390_PC16, .text-ADDEND + .space 2 diff --git a/lld/test/ELF/systemz-reloc-pc32.s b/lld/test/ELF/systemz-reloc-pc32.s new file mode 100644 index 00000000000000..0cb9322eb1c1b9 --- /dev/null +++ b/lld/test/ELF/systemz-reloc-pc32.s @@ -0,0 +1,39 @@ +# REQUIRES: systemz +# RUN: rm -rf %t && split-file %s %t + +## Check recompile with -fPIC error message +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %t/shared.s -o %t/shared.o +# RUN: not ld.lld -shared %t/shared.o -o /dev/null 2>&1 | FileCheck %s + +# CHECK: error: relocation R_390_PC32 cannot be used against symbol '_shared'; recompile with -fPIC +# CHECK: >>> defined in {{.*}} +# CHECK: >>> referenced by {{.*}}:(.data+0x1) + +## Check patching of negative addends + +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym ADDEND=1 %t/addend.s -o %t/1.o +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym ADDEND=2147483648 %t/addend.s -o %t/2.o +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym ADDEND=2147483649 %t/addend.s -o %t/3.o + +# RUN: ld.lld --section-start=.text=0x0 %t/1.o -o %t/1out +# RUN: ld.lld --section-start=.text=0x0 %t/2.o -o %t/2out +# RUN: not ld.lld --section-start=.text=0x0 %t/3.o -o /dev/null 2>&1 | FileCheck %s -DFILE=%t/3.o --check-prefix RANGE + +# RANGE: error: [[FILE]]:(.text+0x0): relocation R_390_PC32 out of range + +# RUN: llvm-readelf --hex-dump=.text %t/1out | FileCheck %s -DADDEND=ffffffff --check-prefix DUMP +# RUN: llvm-readelf --hex-dump=.text %t/2out | FileCheck %s -DADDEND=80000000 --check-prefix DUMP + +# DUMP: 0x00000000 [[ADDEND]] + +#--- shared.s +.data + .byte 0xe8 + .long _shared - . 
+ +#--- addend.s +.text +.globl _start +_start: + .reloc ., R_390_PC32, .text-ADDEND + .space 4 diff --git a/lld/test/ELF/systemz-reloc-pcdbl.s b/lld/test/ELF/systemz-reloc-pcdbl.s new file mode 100644 index 00000000000000..faee756f5e95b4 --- /dev/null +++ b/lld/test/ELF/systemz-reloc-pcdbl.s @@ -0,0 +1,68 @@ +# REQUIRES: systemz + +# RUN: llvm-mc --filetype=obj --triple=s390x-unknown-linux -mcpu=z13 %s -o %t.o + +# RUN: ld.lld %t.o --defsym foo16=pc16dbl+4 --defsym bar16=pc16dbl --defsym foo32=pc32dbl+6 --defsym bar32=pc32dbl --defsym foo12=pc12dbl+6 --defsym bar12=pc12dbl --defsym foo24=pc24dbl+6 --defsym bar24=pc24dbl -o %t +# RUN: llvm-objdump --no-show-raw-insn --mcpu=z13 -d %t | FileCheck %s --check-prefix=CHECK +# CHECK: 0000000001001120 : +# CHECK: je 0x1001124 +# CHECK: jne 0x1001120 +# CHECK: 0000000001001128 : +# CHECK: jge 0x100112e +# CHECK: jgne 0x1001128 +# CHECK: 0000000001001134 : +# CHECK: bprp 5, 0x100113a, 0x1001134 +# CHECK: bprp 6, 0x1001134, 0x100113a +# CHECK: 0000000001001140 : +# CHECK: bprp 5, 0x1001140, 0x1001146 +# CHECK: bprp 6, 0x1001146, 0x1001140 + +# RUN: ld.lld %t.o --defsym foo16=pc16dbl+0xfffe --defsym bar16=pc16dbl+4-0x10000 --defsym foo32=pc32dbl+0xfffffffe --defsym bar32=pc32dbl+6-0x100000000 --defsym foo12=pc12dbl+0xffe --defsym bar12=pc12dbl+6-0x1000 --defsym foo24=pc24dbl+0xfffffe --defsym bar24=pc24dbl+6-0x1000000 -o %t.limits +# RUN: llvm-objdump --no-show-raw-insn --mcpu=z13 -d %t.limits | FileCheck %s --check-prefix=LIMITS +# LIMITS: je 0x101111e +# LIMITS-NEXT: jne 0xff1124 +# LIMITS: jge 0x101001126 +# LIMITS-NEXT: jgne 0xffffffff0100112e +# LIMITS: bprp 5, 0x1002132, 0x1001134 +# LIMITS-NEXT: bprp 6, 0x100013a, 0x100113a +# LIMITS: bprp 5, 0x1001140, 0x200113e +# LIMITS-NEXT: bprp 6, 0x1001146, 0x1146 + +# RUN: not ld.lld %t.o --defsym foo16=pc16dbl+0x10000 --defsym bar16=pc16dbl+4-0x10002 --defsym foo32=pc32dbl+0x100000000 --defsym bar32=pc32dbl+6-0x100000002 --defsym foo12=pc12dbl+0x1000 --defsym 
bar12=pc12dbl+6-0x1002 --defsym foo24=pc24dbl+0x1000000 --defsym bar24=pc24dbl+6-0x1000002 -o /dev/null 2>&1 | FileCheck -DFILE=%t.o --check-prefix=ERROR-RANGE %s +# ERROR-RANGE: error: [[FILE]]:(.text+0x2): relocation R_390_PC16DBL out of range: 65536 is not in [-65536, 65535]; references 'foo16' +# ERROR-RANGE: error: [[FILE]]:(.text+0x6): relocation R_390_PC16DBL out of range: -65538 is not in [-65536, 65535]; references 'bar16' +# ERROR-RANGE: error: [[FILE]]:(.text+0xa): relocation R_390_PC32DBL out of range: 4294967296 is not in [-4294967296, 4294967295]; references 'foo32' +# ERROR-RANGE: error: [[FILE]]:(.text+0x10): relocation R_390_PC32DBL out of range: -4294967298 is not in [-4294967296, 4294967295]; references 'bar32' +# ERROR-RANGE: error: [[FILE]]:(.text+0x15): relocation R_390_PC12DBL out of range: 4096 is not in [-4096, 4095]; references 'foo12' +# ERROR-RANGE: error: [[FILE]]:(.text+0x1b): relocation R_390_PC12DBL out of range: -4098 is not in [-4096, 4095]; references 'bar12' +# ERROR-RANGE: error: [[FILE]]:(.text+0x23): relocation R_390_PC24DBL out of range: 16777216 is not in [-16777216, 16777215]; references 'foo24' +# ERROR-RANGE: error: [[FILE]]:(.text+0x29): relocation R_390_PC24DBL out of range: -16777218 is not in [-16777216, 16777215]; references 'bar24' + +# RUN: not ld.lld %t.o --defsym foo16=pc16dbl+1 --defsym bar16=pc16dbl-1 --defsym foo32=pc32dbl+1 --defsym bar32=pc32dbl-1 --defsym foo12=pc12dbl+1 --defsym bar12=pc12dbl-1 --defsym foo24=pc24dbl+1 --defsym bar24=pc24dbl-1 -o /dev/null 2>&1 | FileCheck -DFILE=%t.o --check-prefix=ERROR-ALIGN %s +# ERROR-ALIGN: error: [[FILE]]:(.text+0x2): improper alignment for relocation R_390_PC16DBL: 0x1 is not aligned to 2 bytes +# ERROR-ALIGN-NEXT: error: [[FILE]]:(.text+0x6): improper alignment for relocation R_390_PC16DBL: 0xFFFFFFFFFFFFFFFB is not aligned to 2 bytes +# ERROR-ALIGN-NEXT: error: [[FILE]]:(.text+0xa): improper alignment for relocation R_390_PC32DBL: 0x1 is not aligned to 2 bytes +# 
ERROR-ALIGN-NEXT: error: [[FILE]]:(.text+0x10): improper alignment for relocation R_390_PC32DBL: 0xFFFFFFFFFFFFFFF9 is not aligned to 2 bytes +# ERROR-ALIGN-NEXT: error: [[FILE]]:(.text+0x15): improper alignment for relocation R_390_PC12DBL: 0x1 is not aligned to 2 bytes +# ERROR-ALIGN-NEXT: error: [[FILE]]:(.text+0x1b): improper alignment for relocation R_390_PC12DBL: 0xFFFFFFFFFFFFFFF9 is not aligned to 2 bytes +# ERROR-ALIGN-NEXT: error: [[FILE]]:(.text+0x23): improper alignment for relocation R_390_PC24DBL: 0x1 is not aligned to 2 bytes +# ERROR-ALIGN-NEXT: error: [[FILE]]:(.text+0x29): improper alignment for relocation R_390_PC24DBL: 0xFFFFFFFFFFFFFFF9 is not aligned to 2 bytes + +.global _start +.global pc16dbl +.global pc32dbl +.global pc12dbl +.global pc24dbl +_start: +pc16dbl: + je foo16 + jne bar16 +pc32dbl: + jge foo32 + jgne bar32 +pc12dbl: + bprp 5,foo12,0 + bprp 6,bar12,0 +pc24dbl: + bprp 5,0,foo24 + bprp 6,0,bar24 diff --git a/lld/test/ELF/systemz-tls-gd.s b/lld/test/ELF/systemz-tls-gd.s new file mode 100644 index 00000000000000..3976f55a6ae39e --- /dev/null +++ b/lld/test/ELF/systemz-tls-gd.s @@ -0,0 +1,142 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o +# RUN: echo '.tbss; .globl b, c; b: .zero 4; c:' | llvm-mc -filetype=obj -triple=s390x-unknown-linux - -o %t1.o +# RUN: ld.lld -shared -soname=t1.so %t1.o -o %t1.so + +# RUN: ld.lld -shared %t.o %t1.o -o %t.so +# RUN: llvm-readelf -r %t.so | FileCheck --check-prefix=GD-REL %s +# RUN: llvm-objdump -d --no-show-raw-insn %t.so | FileCheck --check-prefix=GD %s +# RUN: llvm-objdump --section .data.rel.ro --full-contents %t.so | FileCheck --check-prefix=GD-DATA %s + +# RUN: ld.lld %t.o %t1.o -o %t.le +# RUN: llvm-readelf -r %t.le | FileCheck --check-prefix=NOREL %s +# RUN: llvm-objdump -d --no-show-raw-insn %t.le | FileCheck --check-prefix=LE %s +# RUN: llvm-objdump --section .data.rel.ro --full-contents %t.le | FileCheck --check-prefix=LE-DATA %s + +# RUN: 
ld.lld %t.o %t1.so -o %t.ie +# RUN: llvm-readelf -r %t.ie | FileCheck --check-prefix=IE-REL %s +# RUN: llvm-objdump -d --no-show-raw-insn %t.ie | FileCheck --check-prefix=IE %s +# RUN: llvm-objdump --section .data.rel.ro --full-contents %t.ie | FileCheck --check-prefix=IE-DATA %s + +# GD-REL: Relocation section '.rela.dyn' at offset {{.*}} contains 6 entries: +# GD-REL: 0000000000002570 0000000200000036 R_390_TLS_DTPMOD 0000000000000008 a + 0 +# GD-REL-NEXT: 0000000000002578 0000000200000037 R_390_TLS_DTPOFF 0000000000000008 a + 0 +# GD-REL-NEXT: 0000000000002580 0000000300000036 R_390_TLS_DTPMOD 000000000000000c b + 0 +# GD-REL-NEXT: 0000000000002588 0000000300000037 R_390_TLS_DTPOFF 000000000000000c b + 0 +# GD-REL-NEXT: 0000000000002590 0000000400000036 R_390_TLS_DTPMOD 0000000000000010 c + 0 +# GD-REL-NEXT: 0000000000002598 0000000400000037 R_390_TLS_DTPOFF 0000000000000010 c + 0 + +## _GLOBAL_OFFSET_TABLE is at 0x2558 +# GD: larl %r12, 0x2558 + +## GOT offset of the TLS module ID / offset pair for a is at 0x2460 +# GD-NEXT: lgrl %r2, 0x2460 +# GD-NEXT: brasl %r14, 0x1440 +# GD-NEXT: lgf %r2, 0(%r2,%r7) + +## GOT offset of the TLS module ID / offset pair for b is at 0x2468 +# GD-NEXT: lgrl %r2, 0x2468 +# GD-NEXT: brasl %r14, 0x1440 +# GD-NEXT: lgf %r2, 0(%r2,%r7) + +## GOT offset of the TLS module ID / offset pair for c is at 0x2470 +# GD-NEXT: lgrl %r2, 0x2470 +# GD-NEXT: brasl %r14, 0x1440 +# GD-NEXT: lgf %r2, 0(%r2,%r7) + +## Constant pool holding GOT offsets of TLS module ID / offset pairs: +# a: 0x2570 / 0x18 +# b: 0x2580 / 0x28 +# c: 0x2590 / 0x38 +# GD-DATA: 2460 00000000 00000018 00000000 00000028 +# GD-DATA-NEXT: 2470 00000000 00000038 + +# NOREL: no relocations + +## _GLOBAL_OFFSET_TABLE is at 0x1002230 +# LE: larl %r12, 0x1002230 + +## TP offset of a is at 0x1002218 +# LE-NEXT: lgrl %r2, 0x1002218 +# LE-NEXT: brcl 0, +# LE-NEXT: lgf %r2, 0(%r2,%r7) + +## TP offset of b is at 0x1002220 +# LE-NEXT: lgrl %r2, 0x1002220 +# LE-NEXT: brcl 0, +# LE-NEXT: 
lgf %r2, 0(%r2,%r7) + +## TP offset of c is at 0x1002228 +# LE-NEXT: lgrl %r2, 0x1002228 +# LE-NEXT: brcl 0, +# LE-NEXT: lgf %r2, 0(%r2,%r7) + +## TP offsets +# a: -8 +# b: -4 +# c: 0 +# LE-DATA: 1002218 ffffffff fffffff8 ffffffff fffffffc +# LE-DATA-NEXT: 1002228 00000000 00000000 + + +# IE-REL: Relocation section '.rela.dyn' at offset {{.*}} contains 2 entries: +# IE-REL: 0000000001002430 0000000200000038 R_390_TLS_TPOFF 0000000000000000 b + 0 +# IE-REL-NEXT: 0000000001002438 0000000300000038 R_390_TLS_TPOFF 0000000000000000 c + 0 + +## _GLOBAL_OFFSET_TABLE is at 0x1002418 +# IE: larl %r12, 0x1002418 + +## TP offset of a is at 0x1002340 +# IE-NEXT: lgrl %r2, 0x1002340 +# IE-NEXT: brcl 0, +# IE-NEXT: lgf %r2, 0(%r2,%r7) + +## GOT offset of the TP offset for b is at 0x1002348 +# IE-NEXT: lgrl %r2, 0x1002348 +# IE-NEXT: lg %r2, 0(%r2,%r12) +# IE-NEXT: lgf %r2, 0(%r2,%r7) + +## GOT offset of the TP offset for c is at 0x1002350 +# IE-NEXT: lgrl %r2, 0x1002350 +# IE-NEXT: lg %r2, 0(%r2,%r12) +# IE-NEXT: lgf %r2, 0(%r2,%r7) + +## TP offsets (a) / GOT offset of TP offsets (b, c) +# a: -4 +# b: 0x1002430 / 0x18 +# c: 0x1002438 / 0x20 +# IE-DATA: 1002340 ffffffff fffffffc 00000000 00000018 +# IE-DATA-NEXT: 1002350 00000000 00000020 + + +ear %r7,%a0 +sllg %r7,%r1,32 +ear %r7,%a1 +larl %r12,_GLOBAL_OFFSET_TABLE_ + +lgrl %r2,.LC0 +brasl %r14,__tls_get_offset@PLT:tls_gdcall:a +lgf %r2,0(%r2,%r7) + +lgrl %r2,.LC1 +brasl %r14,__tls_get_offset@PLT:tls_gdcall:b +lgf %r2,0(%r2,%r7) + +lgrl %r2,.LC2 +brasl %r14,__tls_get_offset@PLT:tls_gdcall:c +lgf %r2,0(%r2,%r7) + + .section .data.rel.ro,"aw" + .align 8 +.LC0: + .quad a@TLSGD +.LC1: + .quad b@TLSGD +.LC2: + .quad c@TLSGD + + .section .tbss + .globl a + .zero 8 +a: + .zero 4 diff --git a/lld/test/ELF/systemz-tls-ie.s b/lld/test/ELF/systemz-tls-ie.s new file mode 100644 index 00000000000000..85e2f24cb61f62 --- /dev/null +++ b/lld/test/ELF/systemz-tls-ie.s @@ -0,0 +1,121 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj 
-triple=s390x-unknown-linux %s -o %t.o + +# RUN: ld.lld -shared %t.o -o %t.so +# RUN: llvm-readelf -r %t.so | FileCheck --check-prefix=IE-REL %s +# RUN: llvm-objdump -d --no-show-raw-insn %t.so | FileCheck --check-prefix=IE %s +# RUN: llvm-objdump --section .data --full-contents %t.so | FileCheck --check-prefix=IE-DATA %s + +# RUN: ld.lld %t.o -o %t +# RUN: llvm-readelf -r %t | FileCheck --check-prefix=NOREL %s +# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=LE %s +# RUN: llvm-objdump --section .data --full-contents %t | FileCheck --check-prefix=LE-DATA %s +# RUN: llvm-objdump --section .got --full-contents %t | FileCheck --check-prefix=LE-GOT %s + +## With -pie we still have the R_390_RELATIVE for the data element, but all GOT +## entries should be fully resolved without any remaining R_390_TLS_TPOFF. +# RUN: ld.lld -pie %t.o -o %t.pie +# RUN: llvm-readelf -r %t.pie | FileCheck --check-prefix=PIE-REL %s +# RUN: llvm-objdump -d --no-show-raw-insn %t.pie | FileCheck --check-prefix=PIE %s +# RUN: llvm-objdump --section .data --full-contents %t.pie | FileCheck --check-prefix=PIE-DATA %s +# RUN: llvm-objdump --section .got --full-contents %t.pie | FileCheck --check-prefix=PIE-GOT %s + +# IE-REL: Relocation section '.rela.dyn' at offset {{.*}} contains 4 entries: +# IE-REL: 0000000000003478 000000000000000c R_390_RELATIVE 2460 +# IE-REL: 0000000000002460 0000000100000038 R_390_TLS_TPOFF 0000000000000008 a + 0 +# IE-REL: 0000000000002468 0000000200000038 R_390_TLS_TPOFF 000000000000000c b + 0 +# IE-REL: 0000000000002470 0000000300000038 R_390_TLS_TPOFF 0000000000000010 c + 0 + +## TP offset for a is at 0x2460 +# IE: lgrl %r1, 0x2460 +# IE-NEXT: lgf %r1, 0(%r1,%r7) + +## TP offset for b is at 0x2468 +# IE-NEXT: lgrl %r1, 0x2468 +# IE-NEXT: lgf %r1, 0(%r1,%r7) + +## TP offset for c is at 0x2470 +# IE-NEXT: lgrl %r1, 0x2470 +# IE-NEXT: lgf %r1, 0(%r1,%r7) + +## Data element: TP offset for a is at 0x2460 (relocated via R_390_RELATIVE above) +# 
IE-DATA: 3478 00000000 00000000 + +# NOREL: no relocations + +## TP offset for a is at 0x1002250 +# LE: lgrl %r1, 0x1002250 +# LE-NEXT: lgf %r1, 0(%r1,%r7) + +## TP offset for b is at 0x1002258 +# LE-NEXT: lgrl %r1, 0x1002258 +# LE-NEXT: lgf %r1, 0(%r1,%r7) + +## TP offset for c is at 0x1002260 +# LE-NEXT: lgrl %r1, 0x1002260 +# LE-NEXT: lgf %r1, 0(%r1,%r7) + +## Data element: TP offset for a is at 0x1002250 +# LE-DATA: 00000000 01002250 + +## TP offsets in GOT: +# a: -8 +# b: -4 +# c: 0 +# LE-GOT: 1002238 00000000 00000000 00000000 00000000 +# LE-GOT: 1002248 00000000 00000000 ffffffff fffffff8 +# LE-GOT: 1002258 ffffffff fffffffc 00000000 00000000 + +# PIE-REL: Relocation section '.rela.dyn' at offset {{.*}} contains 1 entries: +# PIE-REL: 00000000000033d0 000000000000000c R_390_RELATIVE 23b8 + +## TP offset for a is at 0x23b8 +# PIE: lgrl %r1, 0x23b8 +# PIE-NEXT: lgf %r1, 0(%r1,%r7) + +## TP offset for b is at 0x23c0 +# PIE-NEXT: lgrl %r1, 0x23c0 +# PIE-NEXT: lgf %r1, 0(%r1,%r7) + +## TP offset for c is at 0x23c8 +# PIE-NEXT: lgrl %r1, 0x23c8 +# PIE-NEXT: lgf %r1, 0(%r1,%r7) + +## Data element: TP offset for a is at 0x23b8 (relocated via R_390_RELATIVE above) +# PIE-DATA: 33d0 00000000 00000000 + +## TP offsets in GOT: +# a: -8 +# b: -4 +# c: 0 +# PIE-GOT: 23a0 00000000 000022d0 00000000 00000000 +# PIE-GOT: 23b0 00000000 00000000 ffffffff fffffff8 +# PIE-GOT: 23c0 ffffffff fffffffc 00000000 00000000 + +ear %r7,%a0 +sllg %r7,%r1,32 +ear %r7,%a1 + +lgrl %r1, a@indntpoff +lgf %r1,0(%r1,%r7) + +lgrl %r1, b@indntpoff +lgf %r1,0(%r1,%r7) + +lgrl %r1, c@indntpoff +lgf %r1,0(%r1,%r7) + + .data + .reloc .,R_390_TLS_IE64,a + .space 8 + + .section .tbss + .globl a + .globl b + .globl c + .zero 8 +a: + .zero 4 +b: + .zero 4 +c: diff --git a/lld/test/ELF/systemz-tls-ld.s b/lld/test/ELF/systemz-tls-ld.s new file mode 100644 index 00000000000000..2cb36d7294f2b0 --- /dev/null +++ b/lld/test/ELF/systemz-tls-ld.s @@ -0,0 +1,114 @@ +# REQUIRES: systemz +# RUN: llvm-mc 
-filetype=obj -triple=s390x-unknown-linux %s -o %t.o + +# RUN: ld.lld -shared %t.o -o %t.so +# RUN: llvm-readelf -r %t.so | FileCheck --check-prefix=LD-REL %s +# RUN: llvm-objdump -d --no-show-raw-insn %t.so | FileCheck --check-prefix=LD %s +# RUN: llvm-objdump --section .data.rel.ro --full-contents %t.so | FileCheck --check-prefix=LD-DATA %s + +# RUN: ld.lld %t.o -o %t +# RUN: llvm-readelf -r %t | FileCheck --check-prefix=NOREL %s +# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=LE %s +# RUN: llvm-objdump --section .data.rel.ro --full-contents %t | FileCheck --check-prefix=LE-DATA %s + +# LD-REL: Relocation section '.rela.dyn' at offset {{.*}} contains 1 entries: +# LD-REL: 00000000000024f8 0000000000000036 R_390_TLS_DTPMOD 0 + +## _GLOBAL_OFFSET_TABLE is at 0x24e0 +# LD: larl %r12, 0x24e0 + +## GOT offset of the LDM TLS module ID is at 0x23e0 +# LD-NEXT: lgrl %r2, 0x23e0 +# LD-NEXT: brasl %r14, 0x13c0 +# LD-NEXT: la %r2, 0(%r2,%r7) + +## DTP offset for a is at 0x23e8 +# LD-NEXT: lgrl %r1, 0x23e8 +# LD-NEXT: lgf %r1, 0(%r1,%r2) + +## DTP offset for b is at 0x23f0 +# LD-NEXT: lgrl %r1, 0x23f0 +# LD-NEXT: lgf %r1, 0(%r1,%r2) + +## DTP offset for c is at 0x23f8 +# LD-NEXT: lgrl %r1, 0x23f8 +# LD-NEXT: lgf %r1, 0(%r1,%r2) + +## Constant pool holding GOT offsets of TLS module ID and DTP offsets: +# TLS module ID: 0x24f8 / 0x18 +# a: 8 +# b: 12 +# c: 16 +# LD-DATA: 23e0 00000000 00000018 00000000 00000008 +# LD-DATA: 23f0 00000000 0000000c 00000000 00000010 + +# NOREL: no relocations + +## _GLOBAL_OFFSET_TABLE is at 0x1002230 +# LE: larl %r12, 0x1002230 + +## GOT offset of the LDM TLS module ID is at 0x1002210 +# LE-NEXT: lgrl %r2, 0x1002210 +# LE-NEXT: brcl 0, +# LE-NEXT: la %r2, 0(%r2,%r7) + +## TP offset for a is at 0x1002218 +# LE-NEXT: lgrl %r1, 0x1002218 +# LE-NEXT: lgf %r1, 0(%r1,%r2) + +## TP offset for b is at 0x1002220 +# LE-NEXT: lgrl %r1, 0x1002220 +# LE-NEXT: lgf %r1, 0(%r1,%r2) + +## TP offset for c is at 0x1002228 +# LE-NEXT: lgrl 
%r1, 0x1002228 +# LE-NEXT: lgf %r1, 0(%r1,%r2) + +## zeroed LDM / TP offsets: +# LDM TLS: 0 +# a: -8 +# b: -4 +# c: 0 +# LE-DATA: 1002210 00000000 00000000 ffffffff fffffff8 +# LE-DATA: 1002220 ffffffff fffffffc 00000000 00000000 + + +ear %r7,%a0 +sllg %r7,%r1,32 +ear %r7,%a1 +larl %r12,_GLOBAL_OFFSET_TABLE_ + +lgrl %r2,.LC0 +brasl %r14,__tls_get_offset@PLT:tls_ldcall:a +la %r2,0(%r2,%r7) + +lgrl %r1, .LC1 +lgf %r1,0(%r1,%r2) + +lgrl %r1, .LC2 +lgf %r1,0(%r1,%r2) + +lgrl %r1, .LC3 +lgf %r1,0(%r1,%r2) + + .section .data.rel.ro,"aw" + .align 8 +.LC0: + .quad a@TLSLDM +.LC1: + .quad a@DTPOFF +.LC2: + .quad b@DTPOFF +.LC3: + .quad c@DTPOFF + + .section .tbss + .globl a + .globl b + .globl c + .zero 8 +a: + .zero 4 +b: + .zero 4 +c: diff --git a/lld/test/ELF/systemz-tls-le.s b/lld/test/ELF/systemz-tls-le.s new file mode 100644 index 00000000000000..9e41fc768da391 --- /dev/null +++ b/lld/test/ELF/systemz-tls-le.s @@ -0,0 +1,61 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o + +# RUN: ld.lld %t.o -o %t +# RUN: llvm-readelf -r %t | FileCheck --check-prefix=NOREL %s +# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=LE %s +# RUN: llvm-objdump --section .data.rel.ro --full-contents %t | FileCheck --check-prefix=LE-DATA %s + +# NOREL: no relocations + +## TP offset for a is at 0x1002200 +# LE: lgrl %r1, 0x1002200 +# LE-NEXT: lgf %r1, 0(%r1,%r7) + +## TP offset for b is at 0x1002208 +# LE-NEXT: lgrl %r1, 0x1002208 +# LE-NEXT: lgf %r1, 0(%r1,%r7) + +## TP offset for c is at 0x1002210 +# LE-NEXT: lgrl %r1, 0x1002210 +# LE-NEXT: lgf %r1, 0(%r1,%r7) + +## TP offsets: +# a: -8 +# b: -4 +# c: 0 +# LE-DATA: 1002200 ffffffff fffffff8 ffffffff fffffffc +# LE-DATA: 1002210 00000000 00000000 + +ear %r7,%a0 +sllg %r7,%r1,32 +ear %r7,%a1 + +lgrl %r1, .LC0 +lgf %r1,0(%r1,%r7) + +lgrl %r1, .LC1 +lgf %r1,0(%r1,%r7) + +lgrl %r1, .LC2 +lgf %r1,0(%r1,%r7) + + .section .data.rel.ro,"aw" + .align 8 +.LC0: + .quad a@ntpoff +.LC1: + 
.quad b@ntpoff +.LC2: + .quad c@ntpoff + + .section .tbss + .globl a + .globl b + .globl c + .zero 8 +a: + .zero 4 +b: + .zero 4 +c: diff --git a/lld/test/ELF/warn-backrefs.s b/lld/test/ELF/warn-backrefs.s index 1e5c14ed052e66..453017eb1c8ec3 100644 --- a/lld/test/ELF/warn-backrefs.s +++ b/lld/test/ELF/warn-backrefs.s @@ -100,6 +100,10 @@ ## -u does not make a backward reference. # RUN: ld.lld --fatal-warnings --warn-backrefs -u foo %t2.a %t1.o -o /dev/null +## --defsym does not make a backward reference, but it does not suppress the warning due to another file. +# RUN: ld.lld --fatal-warnings --warn-backrefs --defsym=x=foo -e 0 %t2.a -o /dev/null +# RUN: ld.lld --warn-backrefs --defsym=x=foo %t2.a %t1.o -o /dev/null 2>&1 | FileCheck %s + # RUN: not ld.lld --warn-backrefs-exclude='[' 2>&1 | FileCheck --check-prefix=INVALID %s # INVALID: error: --warn-backrefs-exclude: invalid glob pattern, unmatched '[': [ diff --git a/lld/test/ELF/x86-64-gotpc-relax-too-far.s b/lld/test/ELF/x86-64-gotpc-relax-too-far.s index 74aa6d8f65a0d8..ba41faab67de5c 100644 --- a/lld/test/ELF/x86-64-gotpc-relax-too-far.s +++ b/lld/test/ELF/x86-64-gotpc-relax-too-far.s @@ -5,7 +5,10 @@ # RUN: llvm-objdump --no-print-imm-hex -d %t/bin | FileCheck --check-prefix=DISASM %s # RUN: llvm-readelf -S %t/bin | FileCheck --check-prefixes=GOT %s # RUN: ld.lld -T %t/lds2 %t/a.o -o %t/bin2 -# RUN: llvm-readelf -S %t/bin2 | FileCheck --check-prefixes=UNNECESSARY-GOT %s +# RUN: llvm-objdump --no-print-imm-hex -d %t/bin2 | FileCheck --check-prefix=DISASM %s +# RUN: llvm-readelf -S %t/bin2 | FileCheck --check-prefixes=GOT %s +# RUN: ld.lld -T %t/lds3 %t/a.o -o %t/bin3 +# RUN: llvm-readelf -S %t/bin3 | FileCheck --check-prefixes=UNNECESSARY-GOT %s # DISASM: <_foo>: # DISASM-NEXT: movl 2097146(%rip), %eax @@ -47,6 +50,13 @@ SECTIONS { data 0x80200000 : { *(data) } } #--- lds2 +SECTIONS { + .text.foo 0x100000 : { *(.text.foo) } + .text 0x1ff000 : { . = . 
+ 0x1000 ; *(.text) } + .got 0x300000 : { *(.got) } + data 0x80200000 : { *(data) } +} +#--- lds3 SECTIONS { .text.foo 0x100000 : { *(.text.foo) } .text 0x200000 : { *(.text) } diff --git a/lld/test/MachO/invalid/invalid-lto-object-path.ll b/lld/test/MachO/invalid/invalid-lto-object-path.ll index 75c6a97e446fb2..c862538d592ce8 100644 --- a/lld/test/MachO/invalid/invalid-lto-object-path.ll +++ b/lld/test/MachO/invalid/invalid-lto-object-path.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ;; Creating read-only directories with `chmod 400` isn't supported on Windows ; UNSUPPORTED: system-windows diff --git a/lld/test/MachO/thinlto-emit-imports.ll b/lld/test/MachO/thinlto-emit-imports.ll index 47a612bd0a7b56..88f766f59c8877 100644 --- a/lld/test/MachO/thinlto-emit-imports.ll +++ b/lld/test/MachO/thinlto-emit-imports.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ; RUN: rm -rf %t; split-file %s %t ; Generate summary sections and test lld handling. diff --git a/lld/test/MinGW/driver.test b/lld/test/MinGW/driver.test index 559a32bfa242f8..057de2a22f6a0c 100644 --- a/lld/test/MinGW/driver.test +++ b/lld/test/MinGW/driver.test @@ -409,6 +409,13 @@ LTO_OPTS: -mllvm:-mcpu=x86-64 -opt:lldlto=2 -dwodir:foo -lto-cs-profile-generate RUN: ld.lld -### foo.o -m i386pep --lto-O2 --lto-CGO1 --lto-cs-profile-generate --lto-cs-profile-file=foo 2>&1 | FileCheck -check-prefix=LTO_OPTS2 %s LTO_OPTS2:-opt:lldlto=2 -opt:lldltocgo=1 -lto-cs-profile-generate -lto-cs-profile-file:foo +RUN: ld.lld -### foo.o -m i386pe -plugin-opt=emit-asm 2>&1 | FileCheck -check-prefix=LTO_EMIT_ASM %s +RUN: ld.lld -### foo.o -m i386pe --lto-emit-asm 2>&1 | FileCheck -check-prefix=LTO_EMIT_ASM %s +LTO_EMIT_ASM: -lldemit:asm + +RUN: ld.lld -### foo.o -m i386pe -plugin-opt=emit-llvm 2>&1 | FileCheck -check-prefix=LTO_EMIT_LLVM %s +LTO_EMIT_LLVM: -lldemit:llvm + Test GCC specific LTO options that GCC passes unconditionally, that we ignore. 
RUN: ld.lld -### foo.o -m i386pep -plugin /usr/lib/gcc/x86_64-w64-mingw32/10-posix/liblto_plugin.so -plugin-opt=/usr/lib/gcc/x86_64-w64-mingw32/10-posix/lto-wrapper -plugin-opt=-fresolution=/tmp/ccM9d4fP.res -plugin-opt=-pass-through=-lmingw32 2> /dev/null diff --git a/lld/test/lit.cfg.py b/lld/test/lit.cfg.py index b3e07f1f823cc4..d309c2ad4ee284 100644 --- a/lld/test/lit.cfg.py +++ b/lld/test/lit.cfg.py @@ -83,6 +83,7 @@ "PowerPC": "ppc", "RISCV": "riscv", "Sparc": "sparc", + "SystemZ": "systemz", "WebAssembly": "wasm", "X86": "x86", }, diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 1d230004e6c34e..f82be164ac9c48 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -19,13 +19,13 @@ if(NOT DEFINED LLVM_VERSION_MAJOR) set(LLVM_VERSION_MAJOR 18) endif() if(NOT DEFINED LLVM_VERSION_MINOR) - set(LLVM_VERSION_MINOR 0) + set(LLVM_VERSION_MINOR 1) endif() if(NOT DEFINED LLVM_VERSION_PATCH) - set(LLVM_VERSION_PATCH 0) + set(LLVM_VERSION_PATCH 5) endif() if(NOT DEFINED LLVM_VERSION_SUFFIX) - set(LLVM_VERSION_SUFFIX git) + set(LLVM_VERSION_SUFFIX) endif() if (NOT PACKAGE_VERSION) @@ -35,7 +35,7 @@ endif() if(NOT DEFINED LLVM_SHLIB_SYMBOL_VERSION) # "Symbol version prefix for libLLVM.so" - set(LLVM_SHLIB_SYMBOL_VERSION "LLVM_${LLVM_VERSION_MAJOR}") + set(LLVM_SHLIB_SYMBOL_VERSION "LLVM_${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}") endif() if ((CMAKE_GENERATOR MATCHES "Visual Studio") AND (MSVC_TOOLSET_VERSION LESS 142) AND (CMAKE_GENERATOR_TOOLSET STREQUAL "")) diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake index 5e989618552824..ceec15b611140d 100644 --- a/llvm/cmake/modules/AddLLVM.cmake +++ b/llvm/cmake/modules/AddLLVM.cmake @@ -108,7 +108,7 @@ function(add_llvm_symbol_exports target_name export_file) COMMAND "${Python3_EXECUTABLE}" "-c" "import sys; \ lines = [' ' + l.rstrip() for l in sys.stdin] + [' local: *;']; \ - print('LLVM_${LLVM_VERSION_MAJOR} {'); \ + print('LLVM_${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR} 
{'); \ print(' global:') if len(lines) > 1 else None; \ print(';\\n'.join(lines) + '\\n};')" < ${export_file} > ${native_export_file} @@ -646,9 +646,9 @@ function(llvm_add_library name) if(UNIX AND NOT APPLE AND NOT ARG_SONAME) set_target_properties(${name} PROPERTIES - # Since 4.0.0, the ABI version is indicated by the major version - SOVERSION ${LLVM_VERSION_MAJOR}${LLVM_VERSION_SUFFIX} - VERSION ${LLVM_VERSION_MAJOR}${LLVM_VERSION_SUFFIX}) + # Since 18.1.0, the ABI version is indicated by the major and minor version. + SOVERSION ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}${LLVM_VERSION_SUFFIX} + VERSION ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}${LLVM_VERSION_SUFFIX}) endif() endif() @@ -2074,7 +2074,7 @@ function(add_lit_testsuites project directory) endfunction() function(llvm_install_library_symlink name dest type) - cmake_parse_arguments(ARG "" "COMPONENT" "" ${ARGN}) + cmake_parse_arguments(ARG "FULL_DEST" "COMPONENT" "" ${ARGN}) foreach(path ${CMAKE_MODULE_PATH}) if(EXISTS ${path}/LLVMInstallSymlink.cmake) set(INSTALL_SYMLINK ${path}/LLVMInstallSymlink.cmake) @@ -2088,7 +2088,11 @@ function(llvm_install_library_symlink name dest type) endif() set(full_name ${CMAKE_${type}_LIBRARY_PREFIX}${name}${CMAKE_${type}_LIBRARY_SUFFIX}) - set(full_dest ${CMAKE_${type}_LIBRARY_PREFIX}${dest}${CMAKE_${type}_LIBRARY_SUFFIX}) + if (ARG_FULL_DEST) + set(full_dest ${dest}) + else() + set(full_dest ${CMAKE_${type}_LIBRARY_PREFIX}${dest}${CMAKE_${type}_LIBRARY_SUFFIX}) + endif() if(LLVM_USE_SYMLINKS) set(LLVM_LINK_OR_COPY create_symlink) diff --git a/llvm/cmake/modules/LLVMConfig.cmake.in b/llvm/cmake/modules/LLVMConfig.cmake.in index 74e1c6bf52e230..770a9caea322e6 100644 --- a/llvm/cmake/modules/LLVMConfig.cmake.in +++ b/llvm/cmake/modules/LLVMConfig.cmake.in @@ -90,6 +90,11 @@ if(LLVM_ENABLE_CURL) find_package(CURL) endif() +set(LLVM_ENABLE_HTTPLIB @LLVM_ENABLE_HTTPLIB@) +if(LLVM_ENABLE_HTTPLIB) + find_package(httplib) +endif() + set(LLVM_WITH_Z3 @LLVM_WITH_Z3@) 
set(LLVM_ENABLE_DIA_SDK @LLVM_ENABLE_DIA_SDK@) diff --git a/llvm/docs/AdvancedBuilds.rst b/llvm/docs/AdvancedBuilds.rst index 960b19fa5317f3..ee178dd3772c4b 100644 --- a/llvm/docs/AdvancedBuilds.rst +++ b/llvm/docs/AdvancedBuilds.rst @@ -145,6 +145,29 @@ that also enables ThinTLO, use the following command: -DPGO_INSTRUMENT_LTO=Thin \ /llvm +By default, clang will generate profile data by compiling a simple +hello world program. You can also tell clang use an external +project for generating profile data that may be a better fit for your +use case. The project you specify must either be a lit test suite +(use the CLANG_PGO_TRAINING_DATA option) or a CMake project (use the +CLANG_PERF_TRAINING_DATA_SOURCE_DIR option). + +For example, If you wanted to use the +`LLVM Test Suite `_ to generate +profile data you would use the following command: + +.. code-block:: console + + $ cmake -G Ninja -C /clang/cmake/caches/PGO.cmake \ + -DBOOTSTRAP_CLANG_PGO_TRAINING_DATA_SOURCE_DIR= \ + -DBOOTSTRAP_CLANG_PGO_TRAINING_DEPS=runtimes + +The BOOTSTRAP\_ prefixes tells CMake to pass the variables on to the instrumented +stage two build. And the CLANG_PGO_TRAINING_DEPS option let's you specify +additional build targets to build before building the external project. The +LLVM Test Suite requires compiler-rt to build, so we need to add the +`runtimes` target as a dependency. + After configuration, building the stage2-instrumented-generate-profdata target will automatically build the stage1 compiler, build the instrumented compiler with the stage1 compiler, and then run the instrumented compiler against the @@ -172,12 +195,12 @@ You can feed that file into the LLVM_PROFDATA_FILE option when you build your optimized compiler. It may be necessary to build additional targets before running perf training, such as -builtins and runtime libraries. You can use the :code:`CLANG_PERF_TRAINING_DEPS` CMake +builtins and runtime libraries. 
You can use the :code:`CLANG_PGO_TRAINING_DEPS` CMake variable for that purpose: .. code-block:: cmake - set(CLANG_PERF_TRAINING_DEPS builtins runtimes CACHE STRING "") + set(CLANG_PGO_TRAINING_DEPS builtins runtimes CACHE STRING "") The PGO cache has a slightly different stage naming scheme than other multi-stage builds. It generates three stages: stage1, stage2-instrumented, and diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst index 6e13cd94b92fda..01a96f2f4f770e 100644 --- a/llvm/docs/CommandGuide/llvm-objcopy.rst +++ b/llvm/docs/CommandGuide/llvm-objcopy.rst @@ -538,6 +538,10 @@ options. For GNU :program:`objcopy` compatibility, the values are all bfdnames. - `elf64-tradlittlemips` - `elf32-sparc` - `elf32-sparcel` +- `elf32-hexagon` +- `elf32-loongarch` +- `elf64-loongarch` +- `elf64-s390` Additionally, all targets except `binary` and `ihex` can have `-freebsd` as a suffix. diff --git a/llvm/docs/CommandGuide/llvm-readelf.rst b/llvm/docs/CommandGuide/llvm-readelf.rst index 6ee4a5dfb15917..675628fdda45ec 100644 --- a/llvm/docs/CommandGuide/llvm-readelf.rst +++ b/llvm/docs/CommandGuide/llvm-readelf.rst @@ -38,6 +38,11 @@ OPTIONS Display the contents of the basic block address map section(s), which contain the address of each function, along with the relative offset of each basic block. +.. option:: --decompress, -z + + Dump decompressed section content when used with ``-x`` or ``-p``. + If the section(s) are not compressed, they are displayed as is. + .. option:: --demangle, -C Display demangled symbol names in the output. diff --git a/llvm/docs/CommandGuide/llvm-readobj.rst b/llvm/docs/CommandGuide/llvm-readobj.rst index cb9232ef5e560a..6d78a038723445 100644 --- a/llvm/docs/CommandGuide/llvm-readobj.rst +++ b/llvm/docs/CommandGuide/llvm-readobj.rst @@ -56,6 +56,11 @@ file formats. Display the address-significance table. +.. 
option:: --decompress, -z + + Dump decompressed section content when used with ``-x`` or ``-p``. + If the section(s) are not compressed, they are displayed as is. + .. option:: --expand-relocs When used with :option:`--relocs`, display each relocation in an expanded diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 7a7ddc59ba985d..74b0439da7fc58 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -5499,6 +5499,8 @@ RISC-V: Sparc: +- ``L``: Print the low-order register of a two-register operand. +- ``H``: Print the high-order register of a two-register operand. - ``r``: No effect. SystemZ: diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 6fdc945ad27078..ba292ea39c8a8a 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -130,6 +130,7 @@ on support follow. ``Zicclsm`` Supported (`See note <#riscv-profiles-extensions-note>`__) ``Ziccrse`` Supported (`See note <#riscv-profiles-extensions-note>`__) ``Zicntr`` (`See Note <#riscv-i2p1-note>`__) + ``Zicond`` Supported ``Zicsr`` (`See Note <#riscv-i2p1-note>`__) ``Zifencei`` (`See Note <#riscv-i2p1-note>`__) ``Zihintntl`` Supported @@ -234,9 +235,6 @@ The primary goal of experimental support is to assist in the process of ratifica ``experimental-zicfilp``, ``experimental-zicfiss`` LLVM implements the `0.4 draft specification `__. -``experimental-zicond`` - LLVM implements the `1.0-rc1 draft specification `__. - ``experimental-ztso`` LLVM implements the `v0.1 proposed specification `__ (see Chapter 25). The mapping from the C/C++ memory model to Ztso has not yet been ratified in any standards document. There are multiple possible mappings, and they are *not* mutually ABI compatible. The mapping LLVM implements is ABI compatible with the default WMO mapping. This mapping may change and there is *explicitly* no ABI stability offered while the extension remains in experimental status. User beware. 
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 7b6a3f10d63777..ff929b0bc5e15b 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -105,6 +105,14 @@ Changes to the AArch64 Backend Armv9.0a has the same features enabled as Armv8.5a, with the exception of crypto. +* Assembler/disassembler support has been added for 2023 architecture + extensions. + +* Support has been added for Stack Clash Protection. During function frame + creation and dynamic stack allocations, the compiler will issue memory + accesses at reguilar intervals so that a guard area at the top of the stack + can't be skipped over. + Changes to the AMDGPU Backend ----------------------------- @@ -156,6 +164,30 @@ Changes to the MIPS Backend Changes to the PowerPC Backend ------------------------------ +* LLJIT's JIT linker now defaults to JITLink on 64-bit ELFv2 targets. +* Initial-exec TLS model is supported on AIX. +* Implemented new resource based scheduling model of POWER7 and POWER8. +* ``frexp`` libcall now references correct symbol name for ``fp128``. +* Optimized materialization of 64-bit immediates, code generation of + ``vec_promote`` and atomics. +* Global constant strings are pooled in the TOC under one entry to reduce the + number of entries in the TOC. +* Added a number of missing Power10 extended mnemonics. +* Added the SCV instruction. +* Fixed register class for the paddi instruction. +* Optimize VPERM and fix code order for swapping vector operands on LE. +* Added various bug fixes and code gen improvements. + +AIX Support/improvements: + +* Support for a non-TOC-based access sequence for the local-exec TLS model (called small local-exec). +* XCOFF toc-data peephole optimization and bug fixes. +* Move less often used __ehinfo TOC entries to the end of the TOC section. 
+* Fixed problems when the AIX libunwind unwinds starting from a signal handler + and the function that raised the signal happens to be a leaf function that + shares the stack frame with its caller or a leaf function that does not store + the stack frame backchain. + Changes to the RISC-V Backend ----------------------------- @@ -181,6 +213,18 @@ Changes to the RISC-V Backend specification. * The Smepmp 1.0 extension is now supported. * ``-mcpu=sifive-p670`` was added. +* Support for the Zicond extension is no longer experimental. + +Changes to the SystemZ Backend +------------------------------ + +* Properly support 16 byte atomic int/fp types and ops. +* Support i128 as legal type in VRs. +* Add an i128 cost model. +* Support building individual functions with backchain using the + __attribute__((target("backchain"))) syntax. +* Add exception handling for XPLINK. +* Add support for llvm-objcopy. Changes to the WebAssembly Backend ---------------------------------- @@ -306,18 +350,44 @@ Changes to the Debug Info Changes to the LLVM tools --------------------------------- -* llvm-symbolizer now treats invalid input as an address for which source +* ``llvm-symbolizer`` now treats invalid input as an address for which source information is not found. -* llvm-readelf now supports ``--extra-sym-info`` (``-X``) to display extra +* Fixed big-endian support in ``llvm-symbolizer``'s DWARF location parser. +* ``llvm-readelf`` now supports ``--extra-sym-info`` (``-X``) to display extra information (section name) when showing symbols. +* ``llvm-readobj``/``llvm-readelf`` now supports ``--decompress``/``-z`` with + string and hex dump for ELF object files. + +* ``llvm-symbolizer`` and ``llvm-addr2line`` now support addresses specified as symbol names. + +* ``llvm-objcopy`` now supports ``--gap-fill`` and ``--pad-to`` options, for + ELF input and binary output files only. +* ``llvm-objcopy`` now supports ``-O elf64-s390`` for SystemZ. 
+ +* Supported parsing XCOFF auxiliary symbols in ``obj2yaml``. +* ``llvm-ranlib`` now supports ``-X`` on AIX to specify the type of object file + ranlib should examine. + +* ``llvm-cxxfilt`` now supports ``--no-params``/``-p`` to skip function + parameters. + +* ``llvm-nm`` now supports ``--export-symbol`` to ignore the import symbol file. * ``llvm-nm`` now supports the ``--line-numbers`` (``-l``) option to use debugging information to print symbols' filenames and line numbers. -* llvm-symbolizer and llvm-addr2line now support addresses specified as symbol names. +* ``llvm-rc`` and ``llvm-windres`` now accept file path references in ``.rc`` files + concatenated from multiple string literals. -* llvm-objcopy now supports ``--gap-fill`` and ``--pad-to`` options, for - ELF input and binary output files only. +* The ``llvm-windres`` option ``--preprocessor`` now resolves its argument + in the ``PATH`` environment variable as expected, and options passed with + ``--preprocessor-arg`` are placed before the input file as they should + be. + +* The ``llvm-windres`` option ``--preprocessor`` has been updated with the + breaking behaviour change from GNU windres from binutils 2.36, where + the whole argument is considered as one path, not considered as a + sequence of tool name and parameters. Changes to LLDB --------------------------------- @@ -359,10 +429,25 @@ Changes to LLDB fields are present, however this is not always possible or entirely accurate. If in doubt, refer to the numerical value. +* On Windows, LLDB can now read the thread names. + Changes to Sanitizers --------------------- * HWASan now defaults to detecting use-after-scope bugs. +* `SpecialCaseList `_ + used by sanitizer ignore lists (e.g. ``*_ignorelist.txt`` in the Clang + resource directory) now uses glob patterns instead of a variant of POSIX + Extended Regular Expression (where ``*`` is translated to ``.*``) by default. 
+ Search for ``|`` to find patterns that may have different meanings now, and + replace ``a|b`` with ``{a,b}``. + +Changes to the Profile Runtime +------------------------------ + +* Public header ``profile/instr_prof_interface.h`` is added to declare four + API functions to fine tune profile collection. + Other Changes ------------- diff --git a/llvm/include/llvm/ADT/iterator_range.h b/llvm/include/llvm/ADT/iterator_range.h index 2dc227935984b1..7d288ea4506ba5 100644 --- a/llvm/include/llvm/ADT/iterator_range.h +++ b/llvm/include/llvm/ADT/iterator_range.h @@ -43,8 +43,8 @@ class iterator_range { IteratorT begin_iterator, end_iterator; public: -#if __GNUC__ == 7 - // Be careful no to break gcc-7 on the mlir target. +#if __GNUC__ == 7 || (__GNUC__ == 8 && __GNUC_MINOR__ < 4) + // Be careful no to break gcc-7 and gcc-8 < 8.4 on the mlir target. // See https://github.com/llvm/llvm-project/issues/63843 template #else diff --git a/llvm/include/llvm/Analysis/AliasAnalysis.h b/llvm/include/llvm/Analysis/AliasAnalysis.h index d6f732d35fd4cd..e8e4f491be5a3d 100644 --- a/llvm/include/llvm/Analysis/AliasAnalysis.h +++ b/llvm/include/llvm/Analysis/AliasAnalysis.h @@ -287,6 +287,10 @@ class AAQueryInfo { /// store %l, ... bool MayBeCrossIteration = false; + /// Whether alias analysis is allowed to use the dominator tree, for use by + /// passes that lazily update the DT while performing AA queries. + bool UseDominatorTree = true; + AAQueryInfo(AAResults &AAR, CaptureInfo *CI) : AAR(AAR), CI(CI) {} }; @@ -668,6 +672,9 @@ class BatchAAResults { void enableCrossIterationMode() { AAQI.MayBeCrossIteration = true; } + + /// Disable the use of the dominator tree during alias analysis queries. 
+ void disableDominatorTree() { AAQI.UseDominatorTree = false; } }; /// Temporary typedef for legacy code that uses a generic \c AliasAnalysis diff --git a/llvm/include/llvm/Analysis/BasicAliasAnalysis.h b/llvm/include/llvm/Analysis/BasicAliasAnalysis.h index afc1811239f283..7eca82729430dd 100644 --- a/llvm/include/llvm/Analysis/BasicAliasAnalysis.h +++ b/llvm/include/llvm/Analysis/BasicAliasAnalysis.h @@ -43,20 +43,26 @@ class BasicAAResult : public AAResultBase { const Function &F; const TargetLibraryInfo &TLI; AssumptionCache &AC; - DominatorTree *DT; + /// Use getDT() instead of accessing this member directly, in order to + /// respect the AAQI.UseDominatorTree option. + DominatorTree *DT_; + + DominatorTree *getDT(const AAQueryInfo &AAQI) const { + return AAQI.UseDominatorTree ? DT_ : nullptr; + } public: BasicAAResult(const DataLayout &DL, const Function &F, const TargetLibraryInfo &TLI, AssumptionCache &AC, DominatorTree *DT = nullptr) - : DL(DL), F(F), TLI(TLI), AC(AC), DT(DT) {} + : DL(DL), F(F), TLI(TLI), AC(AC), DT_(DT) {} BasicAAResult(const BasicAAResult &Arg) : AAResultBase(Arg), DL(Arg.DL), F(Arg.F), TLI(Arg.TLI), AC(Arg.AC), - DT(Arg.DT) {} + DT_(Arg.DT_) {} BasicAAResult(BasicAAResult &&Arg) : AAResultBase(std::move(Arg)), DL(Arg.DL), F(Arg.F), TLI(Arg.TLI), - AC(Arg.AC), DT(Arg.DT) {} + AC(Arg.AC), DT_(Arg.DT_) {} /// Handle invalidation events in the new pass manager. 
bool invalidate(Function &Fn, const PreservedAnalyses &PA, diff --git a/llvm/include/llvm/Analysis/BranchProbabilityInfo.h b/llvm/include/llvm/Analysis/BranchProbabilityInfo.h index 6b9d1781820111..91e1872e9bd6ff 100644 --- a/llvm/include/llvm/Analysis/BranchProbabilityInfo.h +++ b/llvm/include/llvm/Analysis/BranchProbabilityInfo.h @@ -122,16 +122,23 @@ class BranchProbabilityInfo { } BranchProbabilityInfo(BranchProbabilityInfo &&Arg) - : Probs(std::move(Arg.Probs)), LastF(Arg.LastF), - EstimatedBlockWeight(std::move(Arg.EstimatedBlockWeight)) {} + : Handles(std::move(Arg.Handles)), Probs(std::move(Arg.Probs)), + LastF(Arg.LastF), + EstimatedBlockWeight(std::move(Arg.EstimatedBlockWeight)) { + for (auto &Handle : Handles) + Handle.setBPI(this); + } BranchProbabilityInfo(const BranchProbabilityInfo &) = delete; BranchProbabilityInfo &operator=(const BranchProbabilityInfo &) = delete; BranchProbabilityInfo &operator=(BranchProbabilityInfo &&RHS) { releaseMemory(); + Handles = std::move(RHS.Handles); Probs = std::move(RHS.Probs); EstimatedBlockWeight = std::move(RHS.EstimatedBlockWeight); + for (auto &Handle : Handles) + Handle.setBPI(this); return *this; } @@ -279,6 +286,8 @@ class BranchProbabilityInfo { } public: + void setBPI(BranchProbabilityInfo *BPI) { this->BPI = BPI; } + BasicBlockCallbackVH(const Value *V, BranchProbabilityInfo *BPI = nullptr) : CallbackVH(const_cast(V)), BPI(BPI) {} }; diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h index 2880ed33a34cbc..0926093bba99de 100644 --- a/llvm/include/llvm/Analysis/Loads.h +++ b/llvm/include/llvm/Analysis/Loads.h @@ -18,7 +18,7 @@ namespace llvm { -class AAResults; +class BatchAAResults; class AssumptionCache; class DataLayout; class DominatorTree; @@ -129,11 +129,10 @@ extern cl::opt DefMaxInstsToScan; /// location in memory, as opposed to the value operand of a store. /// /// \returns The found value, or nullptr if no value is found. 
-Value *FindAvailableLoadedValue(LoadInst *Load, - BasicBlock *ScanBB, +Value *FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB, BasicBlock::iterator &ScanFrom, unsigned MaxInstsToScan = DefMaxInstsToScan, - AAResults *AA = nullptr, + BatchAAResults *AA = nullptr, bool *IsLoadCSE = nullptr, unsigned *NumScanedInst = nullptr); @@ -141,7 +140,8 @@ Value *FindAvailableLoadedValue(LoadInst *Load, /// FindAvailableLoadedValue() for the case where we are not interested in /// finding the closest clobbering instruction if no available load is found. /// This overload cannot be used to scan across multiple blocks. -Value *FindAvailableLoadedValue(LoadInst *Load, AAResults &AA, bool *IsLoadCSE, +Value *FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA, + bool *IsLoadCSE, unsigned MaxInstsToScan = DefMaxInstsToScan); /// Scan backwards to see if we have the value of the given pointer available @@ -170,7 +170,7 @@ Value *FindAvailableLoadedValue(LoadInst *Load, AAResults &AA, bool *IsLoadCSE, Value *findAvailablePtrLoadStore(const MemoryLocation &Loc, Type *AccessTy, bool AtLeastAtomic, BasicBlock *ScanBB, BasicBlock::iterator &ScanFrom, - unsigned MaxInstsToScan, AAResults *AA, + unsigned MaxInstsToScan, BatchAAResults *AA, bool *IsLoadCSE, unsigned *NumScanedInst); /// Returns true if a pointer value \p A can be replace with another pointer diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index af3ad822e0b0de..0880f9c65aa45d 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -1314,6 +1314,13 @@ class ScalarEvolution { void getPoisonGeneratingValues(SmallPtrSetImpl &Result, const SCEV *S); + /// Check whether it is poison-safe to represent the expression S using the + /// instruction I. If such a replacement is performed, the poison flags of + /// instructions in DropPoisonGeneratingInsts must be dropped. 
+ bool canReuseInstruction( + const SCEV *S, Instruction *I, + SmallVectorImpl &DropPoisonGeneratingInsts); + class FoldID { const SCEV *Op = nullptr; const Type *Ty = nullptr; diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def index f09e12f3038cac..07edf68c667a27 100644 --- a/llvm/include/llvm/Analysis/VecFuncs.def +++ b/llvm/include/llvm/Analysis/VecFuncs.def @@ -771,8 +771,8 @@ TLI_DEFINE_VECFUNC("log2f", "_ZGVsMxv_log2f", SCALABLE(4), MASKED, "_ZGVsMxv") TLI_DEFINE_VECFUNC("llvm.log2.f64", "_ZGVsMxv_log2", SCALABLE(2), MASKED, "_ZGVsMxv") TLI_DEFINE_VECFUNC("llvm.log2.f32", "_ZGVsMxv_log2f", SCALABLE(4), MASKED, "_ZGVsMxv") -TLI_DEFINE_VECFUNC("modf", "_ZGVsMxvl8_modf", SCALABLE(2), MASKED, "_ZGVsMxvl8") -TLI_DEFINE_VECFUNC("modff", "_ZGVsMxvl4_modff", SCALABLE(4), MASKED, "_ZGVsMxvl4") +TLI_DEFINE_VECFUNC("modf", "_ZGVsNxvl8_modf", SCALABLE(2), NOMASK, "_ZGVsNxvl8") +TLI_DEFINE_VECFUNC("modff", "_ZGVsNxvl4_modff", SCALABLE(4), NOMASK, "_ZGVsNxvl4") TLI_DEFINE_VECFUNC("nextafter", "_ZGVsMxvv_nextafter", SCALABLE(2), MASKED, "_ZGVsMxvv") TLI_DEFINE_VECFUNC("nextafterf", "_ZGVsMxvv_nextafterf", SCALABLE(4), MASKED, "_ZGVsMxvv") @@ -787,11 +787,11 @@ TLI_DEFINE_VECFUNC("sinf", "_ZGVsMxv_sinf", SCALABLE(4), MASKED, "_ZGVsMxv") TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGVsMxv_sin", SCALABLE(2), MASKED, "_ZGVsMxv") TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVsMxv_sinf", SCALABLE(4), MASKED, "_ZGVsMxv") -TLI_DEFINE_VECFUNC("sincos", "_ZGVsMxvl8l8_sincos", SCALABLE(2), MASKED, "_ZGVsMxvl8l8") -TLI_DEFINE_VECFUNC("sincosf", "_ZGVsMxvl4l4_sincosf", SCALABLE(4), MASKED, "_ZGVsMxvl4l4") +TLI_DEFINE_VECFUNC("sincos", "_ZGVsNxvl8l8_sincos", SCALABLE(2), NOMASK, "_ZGVsNxvl8l8") +TLI_DEFINE_VECFUNC("sincosf", "_ZGVsNxvl4l4_sincosf", SCALABLE(4), NOMASK, "_ZGVsNxvl4l4") -TLI_DEFINE_VECFUNC("sincospi", "_ZGVsMxvl8l8_sincospi", SCALABLE(2), MASKED, "_ZGVsMxvl8l8") -TLI_DEFINE_VECFUNC("sincospif", "_ZGVsMxvl4l4_sincospif", SCALABLE(4), MASKED, 
"_ZGVsMxvl4l4") +TLI_DEFINE_VECFUNC("sincospi", "_ZGVsNxvl8l8_sincospi", SCALABLE(2), NOMASK, "_ZGVsNxvl8l8") +TLI_DEFINE_VECFUNC("sincospif", "_ZGVsNxvl4l4_sincospif", SCALABLE(4), NOMASK, "_ZGVsNxvl4l4") TLI_DEFINE_VECFUNC("sinh", "_ZGVsMxv_sinh", SCALABLE(2), MASKED, "_ZGVsMxv") TLI_DEFINE_VECFUNC("sinhf", "_ZGVsMxv_sinhf", SCALABLE(4), MASKED, "_ZGVsMxv") @@ -1005,8 +1005,6 @@ TLI_DEFINE_VECFUNC("llvm.log2.f32", "armpl_svlog2_f32_x", SCALABLE(4), MASKED, " TLI_DEFINE_VECFUNC("modf", "armpl_vmodfq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vl8") TLI_DEFINE_VECFUNC("modff", "armpl_vmodfq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vl4") -TLI_DEFINE_VECFUNC("modf", "armpl_svmodf_f64_x", SCALABLE(2), MASKED, "_ZGVsMxvl8") -TLI_DEFINE_VECFUNC("modff", "armpl_svmodf_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvl4") TLI_DEFINE_VECFUNC("nextafter", "armpl_vnextafterq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv") TLI_DEFINE_VECFUNC("nextafterf", "armpl_vnextafterq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv") @@ -1035,13 +1033,9 @@ TLI_DEFINE_VECFUNC("llvm.sin.f32", "armpl_svsin_f32_x", SCALABLE(4), MASKED, "_Z TLI_DEFINE_VECFUNC("sincos", "armpl_vsincosq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vl8l8") TLI_DEFINE_VECFUNC("sincosf", "armpl_vsincosq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vl4l4") -TLI_DEFINE_VECFUNC("sincos", "armpl_svsincos_f64_x", SCALABLE(2), MASKED, "_ZGVsMxvl8l8") -TLI_DEFINE_VECFUNC("sincosf", "armpl_svsincos_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvl4l4") TLI_DEFINE_VECFUNC("sincospi", "armpl_vsincospiq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vl8l8") TLI_DEFINE_VECFUNC("sincospif", "armpl_vsincospiq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vl4l4") -TLI_DEFINE_VECFUNC("sincospi", "armpl_svsincospi_f64_x", SCALABLE(2), MASKED, "_ZGVsMxvl8l8") -TLI_DEFINE_VECFUNC("sincospif", "armpl_svsincospi_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvl4l4") TLI_DEFINE_VECFUNC("sinh", "armpl_vsinhq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v") TLI_DEFINE_VECFUNC("sinhf", "armpl_vsinhq_f32", FIXED(4), NOMASK, 
"_ZGV_LLVM_N4v") diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h index 7a92e62b53c53d..c6eb66cc9660ca 100644 --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -406,6 +406,11 @@ bool maskIsAllZeroOrUndef(Value *Mask); /// lanes can be assumed active. bool maskIsAllOneOrUndef(Value *Mask); +/// Given a mask vector of i1, Return true if any of the elements of this +/// predicate mask are known to be true or undef. That is, return true if at +/// least one lane can be assumed active. +bool maskContainsAllOneOrUndef(Value *Mask); + /// Given a mask vector of the form , return an APInt (of bitwidth Y) /// for each lane which may be active. APInt possiblyDemandedEltsInMask(Value *Mask); diff --git a/llvm/include/llvm/BinaryFormat/COFF.h b/llvm/include/llvm/BinaryFormat/COFF.h index 522ee37da6e830..72461d0d9c316a 100644 --- a/llvm/include/llvm/BinaryFormat/COFF.h +++ b/llvm/include/llvm/BinaryFormat/COFF.h @@ -716,7 +716,10 @@ enum ImportNameType : unsigned { IMPORT_NAME_NOPREFIX = 2, /// The import name is the public symbol name, but skipping the leading ?, /// @, or optionally _, and truncating at the first @. - IMPORT_NAME_UNDECORATE = 3 + IMPORT_NAME_UNDECORATE = 3, + /// The import name is specified as a separate string in the import library + /// object file. + IMPORT_NAME_EXPORTAS = 4 }; enum class GuardFlags : uint32_t { diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 81cdd39afc6bab..f17ba75e3efa6a 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -1464,6 +1464,7 @@ enum { PT_OPENBSD_RANDOMIZE = 0x65a3dbe6, // Fill with random data. PT_OPENBSD_WXNEEDED = 0x65a3dbe7, // Program does W^X violations. PT_OPENBSD_NOBTCFI = 0x65a3dbe8, // Do not enforce branch target CFI. + PT_OPENBSD_SYSCALLS = 0x65a3dbe9, // System call sites. 
PT_OPENBSD_BOOTDATA = 0x65a41be6, // Section for boot arguments. // ARM program header types. diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h b/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h index 0f20a33f3a755c..7990997835d019 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h @@ -35,11 +35,23 @@ struct LegalityQuery; class MachineRegisterInfo; namespace GISelAddressing { /// Helper struct to store a base, index and offset that forms an address -struct BaseIndexOffset { +class BaseIndexOffset { +private: Register BaseReg; Register IndexReg; - int64_t Offset = 0; - bool IsIndexSignExt = false; + std::optional Offset; + +public: + BaseIndexOffset() = default; + Register getBase() { return BaseReg; } + Register getBase() const { return BaseReg; } + Register getIndex() { return IndexReg; } + Register getIndex() const { return IndexReg; } + void setBase(Register NewBase) { BaseReg = NewBase; } + void setIndex(Register NewIndex) { IndexReg = NewIndex; } + void setOffset(std::optional NewOff) { Offset = NewOff; } + bool hasValidOffset() const { return Offset.has_value(); } + int64_t getOffset() const { return *Offset; } }; /// Returns a BaseIndexOffset which describes the pointer in \p Ptr. @@ -89,7 +101,7 @@ class LoadStoreOpt : public MachineFunctionPass { // order stores are writing to incremeneting consecutive addresses. So when // we walk the block in reverse order, the next eligible store must write to // an offset one store width lower than CurrentLowestOffset. - uint64_t CurrentLowestOffset; + int64_t CurrentLowestOffset; SmallVector Stores; // A vector of MachineInstr/unsigned pairs to denote potential aliases that // need to be checked before the candidate is considered safe to merge. 
The diff --git a/llvm/include/llvm/CodeGen/LivePhysRegs.h b/llvm/include/llvm/CodeGen/LivePhysRegs.h index 76bb34d270a26d..1d40b1cbb0eaa3 100644 --- a/llvm/include/llvm/CodeGen/LivePhysRegs.h +++ b/llvm/include/llvm/CodeGen/LivePhysRegs.h @@ -193,11 +193,18 @@ void addLiveIns(MachineBasicBlock &MBB, const LivePhysRegs &LiveRegs); void computeAndAddLiveIns(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB); -/// Convenience function for recomputing live-in's for \p MBB. -static inline void recomputeLiveIns(MachineBasicBlock &MBB) { +/// Convenience function for recomputing live-in's for a MBB. Returns true if +/// any changes were made. +static inline bool recomputeLiveIns(MachineBasicBlock &MBB) { LivePhysRegs LPR; + auto oldLiveIns = MBB.getLiveIns(); + MBB.clearLiveIns(); computeAndAddLiveIns(LPR, MBB); + MBB.sortUniqueLiveIns(); + + auto newLiveIns = MBB.getLiveIns(); + return oldLiveIns != newLiveIns; } } // end namespace llvm diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h index c84fd281c6a549..dc2035fa598c46 100644 --- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -111,6 +111,10 @@ class MachineBasicBlock RegisterMaskPair(MCPhysReg PhysReg, LaneBitmask LaneMask) : PhysReg(PhysReg), LaneMask(LaneMask) {} + + bool operator==(const RegisterMaskPair &other) const { + return PhysReg == other.PhysReg && LaneMask == other.LaneMask; + } }; private: @@ -473,6 +477,8 @@ class MachineBasicBlock /// Remove entry from the livein set and return iterator to the next. 
livein_iterator removeLiveIn(livein_iterator I); + std::vector getLiveIns() const { return LiveIns; } + class liveout_iterator { public: using iterator_category = std::input_iterator_tag; diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index 864f87f3383891..d22eb76d2292d5 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -339,14 +339,26 @@ def UseSampleProfile : StrBoolAttr<"use-sample-profile">; def DenormalFPMath : ComplexStrAttr<"denormal-fp-math", [FnAttr]>; def DenormalFPMathF32 : ComplexStrAttr<"denormal-fp-math-f32", [FnAttr]>; +// Attribute compatibility rules are generated to check the attribute of the +// caller and callee and decide whether inlining should be allowed. CompatRule +// and child classes are used for the rule generation. CompatRule takes only a +// compare function which could be templated with the attribute type. +// CompatRuleStrAttr takes the compare function and the string attribute for +// checking compatibility for inline substitution. class CompatRule { - // The name of the function called to check the attribute of the caller and - // callee and decide whether inlining should be allowed. The function's - // signature must match "bool(const Function&, const Function &)", where the - // first parameter is the reference to the caller and the second parameter is - // the reference to the callee. It must return false if the attributes of the - // caller and callee are incompatible, and true otherwise. + // The function's signature must match "bool(const Function&, const + // Function&)", where the first parameter is the reference to the caller and + // the second parameter is the reference to the callee. It must return false + // if the attributes of the caller and callee are incompatible, and true + // otherwise. 
string CompatFunc = F; + string AttrName = ""; +} + +class CompatRuleStrAttr : CompatRule { + // The checker function is extended with an third argument as the function + // attribute string "bool(const Function&, const Function&, const StringRef&)". + string AttrName = Attr; } def : CompatRule<"isEqual">; @@ -359,7 +371,9 @@ def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"checkDenormMode">; - +def : CompatRuleStrAttr<"isEqual", "sign-return-address">; +def : CompatRuleStrAttr<"isEqual", "sign-return-address-key">; +def : CompatRuleStrAttr<"isEqual", "branch-protection-pauth-lr">; class MergeRule { // The name of the function called to merge the attributes of the caller and diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index e6db9da5526aa3..c5f43d17d1c148 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2601,6 +2601,11 @@ def int_amdgcn_ds_bvh_stack_rtn : [ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree] >; +def int_amdgcn_s_wait_event_export_ready : + ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">, + Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn] +>; + // WMMA (Wave Matrix Multiply-Accumulate) intrinsics // // These operations perform a matrix multiplication and accumulation of @@ -2608,10 +2613,10 @@ def int_amdgcn_ds_bvh_stack_rtn : class AMDGPUWmmaIntrinsic : Intrinsic< - [CD], // %D + [CD], // %D [ AB, // %A - AB, // %B + LLVMMatchType<1>, // %B LLVMMatchType<0>, // %C ], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] @@ -2619,49 +2624,50 @@ class AMDGPUWmmaIntrinsic : class AMDGPUWmmaIntrinsicOPSEL : Intrinsic< - [CD], // %D + [CD], // %D [ AB, // %A - AB, // %B + LLVMMatchType<1>, // %B LLVMMatchType<0>, // %C - llvm_i1_ty, // %high + llvm_i1_ty, // %high (op_sel) for GFX11, 0 for GFX12 ], [IntrNoMem, IntrConvergent, ImmArg>, 
IntrWillReturn, IntrNoCallback, IntrNoFree] >; class AMDGPUWmmaIntrinsicIU : Intrinsic< - [CD], // %D + [CD], // %D [ llvm_i1_ty, // %A_sign AB, // %A llvm_i1_ty, // %B_sign - AB, // %B + LLVMMatchType<1>, // %B LLVMMatchType<0>, // %C llvm_i1_ty, // %clamp ], [IntrNoMem, IntrConvergent, ImmArg>, ImmArg>, ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree] >; -def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic; -def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic; -// The regular, untied f16/bf16 wmma intrinsics only write to one half -// of the registers (set via the op_sel bit). -// The content of the other 16-bit of the registers is undefined. -def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL; -def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL; -// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix -// registers to the input accumulator registers. -// Essentially, the content of the other 16-bit is preserved from the input. -def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL; -def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL; -def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU; -def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU; +// WMMA GFX11Only -def int_amdgcn_s_wait_event_export_ready : - ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">, - Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn] ->; +// The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit. +// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers. +// The content of the other 16-bit half is preserved from the input. 
+def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL; +def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL; + +// WMMA GFX11Plus + +def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU; +def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU; + +// GFX11: The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit. +// The content of the other 16-bit half is undefined. +// GFX12: The op_sel bit must be 0. +def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL; +def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL; //===----------------------------------------------------------------------===// // GFX12 Intrinsics @@ -2681,6 +2687,65 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var" [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree]>; + +// WMMA (Wave Matrix Multiply-Accumulate) intrinsics +// +// These operations perform a matrix multiplication and accumulation of +// the form: D = A * B + C . + +// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>. +def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic; +// A and B are <16 x iu4>. +def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU; + +// SWMMAC (Wave Matrix(sparse) Multiply-Accumulate) intrinsics +// +// These operations perform a sparse matrix multiplication and accumulation of +// the form: D = A * B + C. +// A is sparse matrix, half the size of B, and is expanded using sparsity index. 
+ +class AMDGPUSWmmacIntrinsicIdx : + Intrinsic< + [CD], // %D + [ + A, // %A + B, // %B + LLVMMatchType<0>, // %C + Index // %Sparsity index for A + ], + [IntrNoMem, IntrConvergent, IntrWillReturn] +>; + +class AMDGPUSWmmacIntrinsicIUIdx : + Intrinsic< + [CD], // %D + [ + llvm_i1_ty, // %A_sign + A, // %A + llvm_i1_ty, // %B_sign + B, // %B + LLVMMatchType<0>, // %C + Index, // %Sparsity index for A + llvm_i1_ty, // %clamp + ], + [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>, ImmArg>] +>; + +def int_amdgcn_swmmac_f32_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_f32_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_f16_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_bf16_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_i32_16x16x32_iu8 : AMDGPUSWmmacIntrinsicIUIdx; +def int_amdgcn_swmmac_i32_16x16x32_iu4 : AMDGPUSWmmacIntrinsicIUIdx; +def int_amdgcn_swmmac_i32_16x16x64_iu4 : AMDGPUSWmmacIntrinsicIUIdx; +def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_f32_16x16x32_fp8_bf8 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_f32_16x16x32_bf8_fp8 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8 : AMDGPUSWmmacIntrinsicIdx; + def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn; def int_amdgcn_flat_atomic_fmin_num : AMDGPUAtomicRtn; @@ -2712,6 +2777,10 @@ class AMDGPULoadTr: def int_amdgcn_global_load_tr : AMDGPULoadTr; +// i32 @llvm.amdgcn.wave.id() +def int_amdgcn_wave_id : + DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; + //===----------------------------------------------------------------------===// // Deep learning intrinsics. 
//===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/Object/COFF.h b/llvm/include/llvm/Object/COFF.h index a548b2c15c5fdc..2a5c3d8913b15c 100644 --- a/llvm/include/llvm/Object/COFF.h +++ b/llvm/include/llvm/Object/COFF.h @@ -1362,6 +1362,47 @@ class SectionStrippedError SectionStrippedError() { setErrorCode(object_error::section_stripped); } }; +inline std::optional +getArm64ECMangledFunctionName(StringRef Name) { + bool IsCppFn = Name[0] == '?'; + if (IsCppFn && Name.find("$$h") != std::string::npos) + return std::nullopt; + if (!IsCppFn && Name[0] == '#') + return std::nullopt; + + StringRef Prefix = "$$h"; + size_t InsertIdx = 0; + if (IsCppFn) { + InsertIdx = Name.find("@@"); + size_t ThreeAtSignsIdx = Name.find("@@@"); + if (InsertIdx != std::string::npos && InsertIdx != ThreeAtSignsIdx) { + InsertIdx += 2; + } else { + InsertIdx = Name.find("@"); + if (InsertIdx != std::string::npos) + InsertIdx++; + } + } else { + Prefix = "#"; + } + + return std::optional( + (Name.substr(0, InsertIdx) + Prefix + Name.substr(InsertIdx)).str()); +} + +inline std::optional +getArm64ECDemangledFunctionName(StringRef Name) { + if (Name[0] == '#') + return std::string(Name.substr(1)); + if (Name[0] != '?') + return std::nullopt; + + std::pair Pair = Name.split("$$h"); + if (Pair.second.empty()) + return std::nullopt; + return (Pair.first + Pair.second).str(); +} + } // end namespace object } // end namespace llvm diff --git a/llvm/include/llvm/Object/COFFImportFile.h b/llvm/include/llvm/Object/COFFImportFile.h index edc836ff0348cb..8358197309f000 100644 --- a/llvm/include/llvm/Object/COFFImportFile.h +++ b/llvm/include/llvm/Object/COFFImportFile.h @@ -26,7 +26,16 @@ namespace llvm { namespace object { +constexpr std::string_view ImportDescriptorPrefix = "__IMPORT_DESCRIPTOR_"; +constexpr std::string_view NullImportDescriptorSymbolName = + "__NULL_IMPORT_DESCRIPTOR"; +constexpr std::string_view NullThunkDataPrefix = 
"\x7f"; +constexpr std::string_view NullThunkDataSuffix = "_NULL_THUNK_DATA"; + class COFFImportFile : public SymbolicFile { +private: + enum SymbolIndex { ImpSymbol, ThunkSymbol, ECAuxSymbol, ECThunkSymbol }; + public: COFFImportFile(MemoryBufferRef Source) : SymbolicFile(ID_COFFImportFile, Source) {} @@ -36,9 +45,23 @@ class COFFImportFile : public SymbolicFile { void moveSymbolNext(DataRefImpl &Symb) const override { ++Symb.p; } Error printSymbolName(raw_ostream &OS, DataRefImpl Symb) const override { - if (Symb.p == 0) + switch (Symb.p) { + case ImpSymbol: OS << "__imp_"; - OS << StringRef(Data.getBufferStart() + sizeof(coff_import_header)); + break; + case ECAuxSymbol: + OS << "__imp_aux_"; + break; + } + const char *Name = Data.getBufferStart() + sizeof(coff_import_header); + if (Symb.p != ECThunkSymbol && COFF::isArm64EC(getMachine())) { + if (std::optional DemangledName = + getArm64ECDemangledFunctionName(Name)) { + OS << StringRef(*DemangledName); + return Error::success(); + } + } + OS << StringRef(Name); return Error::success(); } @@ -52,7 +75,12 @@ class COFFImportFile : public SymbolicFile { basic_symbol_iterator symbol_end() const override { DataRefImpl Symb; - Symb.p = isData() ? 1 : 2; + if (isData()) + Symb.p = ImpSymbol + 1; + else if (COFF::isArm64EC(getMachine())) + Symb.p = ECThunkSymbol + 1; + else + Symb.p = ThunkSymbol + 1; return BasicSymbolRef(Symb, this); } @@ -66,6 +94,7 @@ class COFFImportFile : public SymbolicFile { uint16_t getMachine() const { return getCOFFImportHeader()->Machine; } StringRef getFileFormatName() const; + StringRef getExportName() const; private: bool isData() const { diff --git a/llvm/include/llvm/Support/FormattedStream.h b/llvm/include/llvm/Support/FormattedStream.h index 5f937cfa798408..850a18dbb94121 100644 --- a/llvm/include/llvm/Support/FormattedStream.h +++ b/llvm/include/llvm/Support/FormattedStream.h @@ -52,6 +52,10 @@ class formatted_raw_ostream : public raw_ostream { /// have the rest of it. 
SmallString<4> PartialUTF8Char; + /// DisableScan - Temporarily disable scanning of output. Used to ignore color + /// codes. + bool DisableScan; + void write_impl(const char *Ptr, size_t Size) override; /// current_pos - Return the current position within the stream, @@ -89,9 +93,33 @@ class formatted_raw_ostream : public raw_ostream { SetUnbuffered(); TheStream->SetUnbuffered(); + enable_colors(TheStream->colors_enabled()); + Scanned = nullptr; } + void PreDisableScan() { + assert(!DisableScan); + ComputePosition(getBufferStart(), GetNumBytesInBuffer()); + assert(PartialUTF8Char.empty()); + DisableScan = true; + } + + void PostDisableScan() { + assert(DisableScan); + DisableScan = false; + Scanned = getBufferStart() + GetNumBytesInBuffer(); + } + + struct DisableScanScope { + formatted_raw_ostream *S; + + DisableScanScope(formatted_raw_ostream *FRO) : S(FRO) { + S->PreDisableScan(); + } + ~DisableScanScope() { S->PostDisableScan(); } + }; + public: /// formatted_raw_ostream - Open the specified file for /// writing. If an error occurs, information about the error is @@ -104,12 +132,12 @@ class formatted_raw_ostream : public raw_ostream { /// underneath it. 
/// formatted_raw_ostream(raw_ostream &Stream) - : TheStream(nullptr), Position(0, 0) { + : TheStream(nullptr), Position(0, 0), DisableScan(false) { setStream(Stream); } - explicit formatted_raw_ostream() : TheStream(nullptr), Position(0, 0) { - Scanned = nullptr; - } + explicit formatted_raw_ostream() + : TheStream(nullptr), Position(0, 0), Scanned(nullptr), + DisableScan(false) {} ~formatted_raw_ostream() override { flush(); @@ -136,17 +164,26 @@ class formatted_raw_ostream : public raw_ostream { } raw_ostream &resetColor() override { - TheStream->resetColor(); + if (colors_enabled()) { + DisableScanScope S(this); + raw_ostream::resetColor(); + } return *this; } raw_ostream &reverseColor() override { - TheStream->reverseColor(); + if (colors_enabled()) { + DisableScanScope S(this); + raw_ostream::reverseColor(); + } return *this; } raw_ostream &changeColor(enum Colors Color, bool Bold, bool BG) override { - TheStream->changeColor(Color, Bold, BG); + if (colors_enabled()) { + DisableScanScope S(this); + raw_ostream::changeColor(Color, Bold, BG); + } return *this; } diff --git a/llvm/include/llvm/Support/X86FoldTablesUtils.h b/llvm/include/llvm/Support/X86FoldTablesUtils.h index ed244febc38d3a..77d32cc7fb37ed 100644 --- a/llvm/include/llvm/Support/X86FoldTablesUtils.h +++ b/llvm/include/llvm/Support/X86FoldTablesUtils.h @@ -46,11 +46,12 @@ enum { // Broadcast type. 
// (stored in bits 12 - 14) TB_BCAST_TYPE_SHIFT = TB_ALIGN_SHIFT + 3, - TB_BCAST_D = 0 << TB_BCAST_TYPE_SHIFT, - TB_BCAST_Q = 1 << TB_BCAST_TYPE_SHIFT, - TB_BCAST_SS = 2 << TB_BCAST_TYPE_SHIFT, - TB_BCAST_SD = 3 << TB_BCAST_TYPE_SHIFT, - TB_BCAST_SH = 4 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_W = 0 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_D = 1 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_Q = 2 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_SS = 3 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_SD = 4 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_SH = 5 << TB_BCAST_TYPE_SHIFT, TB_BCAST_MASK = 0x7 << TB_BCAST_TYPE_SHIFT, // Unused bits 15-16 diff --git a/llvm/include/llvm/Target/TargetInstrPredicate.td b/llvm/include/llvm/Target/TargetInstrPredicate.td index 82c4c7b23a49b6..b5419cb9f3867f 100644 --- a/llvm/include/llvm/Target/TargetInstrPredicate.td +++ b/llvm/include/llvm/Target/TargetInstrPredicate.td @@ -152,6 +152,34 @@ class CheckImmOperand_s : CheckOperandBase { string ImmVal = Value; } +// Check that the operand at position `Index` is less than `Imm`. +// If field `FunctionMapper` is a non-empty string, then function +// `FunctionMapper` is applied to the operand value, and the return value is then +// compared against `Imm`. +class CheckImmOperandLT : CheckOperandBase { + int ImmVal = Imm; +} + +// Check that the operand at position `Index` is greater than `Imm`. +// If field `FunctionMapper` is a non-empty string, then function +// `FunctionMapper` is applied to the operand value, and the return value is then +// compared against `Imm`. +class CheckImmOperandGT : CheckOperandBase { + int ImmVal = Imm; +} + +// Check that the operand at position `Index` is less than or equal to `Imm`. +// If field `FunctionMapper` is a non-empty string, then function +// `FunctionMapper` is applied to the operand value, and the return value is then +// compared against `Imm`. +class CheckImmOperandLE : CheckNot>; + +// Check that the operand at position `Index` is greater than or equal to `Imm`. 
+// If field `FunctionMapper` is a non-empty string, then function +// `FunctionMapper` is applied to the operand value, and the return value is then +// compared against `Imm`. +class CheckImmOperandGE : CheckNot>; + // Expands to a call to `FunctionMapper` if field `FunctionMapper` is set. // Otherwise, it expands to a CheckNot>. class CheckRegOperandSimple : CheckOperandBase; @@ -203,6 +231,12 @@ class CheckAll Sequence> class CheckAny Sequence> : CheckPredicateSequence; +// Check that the operand at position `Index` is in range [Start, End]. +// If field `FunctionMapper` is a non-empty string, then function +// `FunctionMapper` is applied to the operand value, and the return value is then +// compared against range [Start, End]. +class CheckImmOperandRange + : CheckAll<[CheckImmOperandGE, CheckImmOperandLE]>; // Used to expand the body of a function predicate. See the definition of // TIIPredicate below. diff --git a/llvm/include/llvm/Target/TargetSchedule.td b/llvm/include/llvm/Target/TargetSchedule.td index 032de728517827..40c2cce8c6effe 100644 --- a/llvm/include/llvm/Target/TargetSchedule.td +++ b/llvm/include/llvm/Target/TargetSchedule.td @@ -620,7 +620,7 @@ class SecondFusionPredicateWithMCInstPredicate : FusionPredicateWithMCInstPredicate; // The pred will be applied on both firstMI and secondMI. class BothFusionPredicateWithMCInstPredicate - : FusionPredicateWithMCInstPredicate; + : FusionPredicateWithMCInstPredicate; // Tie firstOpIdx and secondOpIdx. 
The operand of `FirstMI` at position // `firstOpIdx` should be the same as the operand of `SecondMI` at position diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 623fdc21ba65a6..c10f92e2871747 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -478,7 +478,7 @@ inline constexpr ArchInfo ARMV8_1A = { VersionTuple{8, 1}, AProfile, "armv8.1-a inline constexpr ArchInfo ARMV8_2A = { VersionTuple{8, 2}, AProfile, "armv8.2-a", "+v8.2a", (ARMV8_1A.DefaultExts | AArch64::ExtensionBitset({AArch64::AEK_RAS}))}; inline constexpr ArchInfo ARMV8_3A = { VersionTuple{8, 3}, AProfile, "armv8.3-a", "+v8.3a", (ARMV8_2A.DefaultExts | - AArch64::ExtensionBitset({AArch64::AEK_RCPC, AArch64::AEK_JSCVT, AArch64::AEK_FCMA}))}; + AArch64::ExtensionBitset({AArch64::AEK_FCMA, AArch64::AEK_JSCVT, AArch64::AEK_PAUTH, AArch64::AEK_RCPC}))}; inline constexpr ArchInfo ARMV8_4A = { VersionTuple{8, 4}, AProfile, "armv8.4-a", "+v8.4a", (ARMV8_3A.DefaultExts | AArch64::ExtensionBitset({AArch64::AEK_DOTPROD}))}; inline constexpr ArchInfo ARMV8_5A = { VersionTuple{8, 5}, AProfile, "armv8.5-a", "+v8.5a", (ARMV8_4A.DefaultExts)}; @@ -805,6 +805,12 @@ inline constexpr CpuInfo CpuInfos[] = { {AArch64::AEK_FP16, AArch64::AEK_RAND, AArch64::AEK_SM4, AArch64::AEK_SHA3, AArch64::AEK_SHA2, AArch64::AEK_AES, AArch64::AEK_MTE, AArch64::AEK_SB, AArch64::AEK_SSBS}))}, + {"ampere1b", ARMV8_7A, + (AArch64::ExtensionBitset({AArch64::AEK_FP16, AArch64::AEK_RAND, + AArch64::AEK_SM4, AArch64::AEK_SHA3, + AArch64::AEK_SHA2, AArch64::AEK_AES, + AArch64::AEK_MTE, AArch64::AEK_SB, + AArch64::AEK_SSBS, AArch64::AEK_CSSC}))}, }; // An alias for a CPU. 
@@ -813,7 +819,8 @@ struct CpuAlias { StringRef Name; }; -inline constexpr CpuAlias CpuAliases[] = {{"grace", "neoverse-v2"}}; +inline constexpr CpuAlias CpuAliases[] = {{"cobalt-100", "neoverse-n2"}, + {"grace", "neoverse-v2"}}; bool getExtensionFeatures( const AArch64::ExtensionBitset &Extensions, diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index 870dc75b1c1f80..49ec8de9c528de 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -1033,11 +1033,11 @@ class Triple { isWindowsCygwinEnvironment() || isOHOSFamily(); } - /// Tests whether the target uses TLS Descriptor by default. + /// True if the target supports both general-dynamic and TLSDESC, and TLSDESC + /// is enabled by default. bool hasDefaultTLSDESC() const { // TODO: Improve check for other platforms, like Android, and RISC-V - // Note: This is currently only used on RISC-V. - return isOSBinFormatELF() && isAArch64(); + return false; } /// Tests whether the target uses -data-sections as default. diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index 3178e2d2781674..1028b52a79123f 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -89,7 +89,7 @@ bool BasicAAResult::invalidate(Function &Fn, const PreservedAnalyses &PA, // may be created without handles to some analyses and in that case don't // depend on them. if (Inv.invalidate(Fn, PA) || - (DT && Inv.invalidate(Fn, PA))) + (DT_ && Inv.invalidate(Fn, PA))) return true; // Otherwise this analysis result remains valid. 
@@ -1063,6 +1063,7 @@ AliasResult BasicAAResult::aliasGEP( : AliasResult::MayAlias; } + DominatorTree *DT = getDT(AAQI); DecomposedGEP DecompGEP1 = DecomposeGEPExpression(GEP1, DL, &AC, DT); DecomposedGEP DecompGEP2 = DecomposeGEPExpression(V2, DL, &AC, DT); @@ -1556,6 +1557,7 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size, const Value *HintO1 = getUnderlyingObject(Hint1); const Value *HintO2 = getUnderlyingObject(Hint2); + DominatorTree *DT = getDT(AAQI); auto ValidAssumeForPtrContext = [&](const Value *Ptr) { if (const Instruction *PtrI = dyn_cast(Ptr)) { return isValidAssumeForContext(Assume, PtrI, DT, @@ -1735,7 +1737,7 @@ bool BasicAAResult::isValueEqualInPotentialCycles(const Value *V, if (!Inst || Inst->getParent()->isEntryBlock()) return true; - return isNotInCycle(Inst, DT, /*LI*/ nullptr); + return isNotInCycle(Inst, getDT(AAQI), /*LI*/ nullptr); } /// Computes the symbolic difference between two de-composed GEPs. diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index d0c27cae0dff99..72b6dfa181e86d 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -439,7 +439,8 @@ static Value *threadBinOpOverSelect(Instruction::BinaryOps Opcode, Value *LHS, // Check that the simplified value has the form "X op Y" where "op" is the // same as the original operation. Instruction *Simplified = dyn_cast(FV ? FV : TV); - if (Simplified && Simplified->getOpcode() == unsigned(Opcode)) { + if (Simplified && Simplified->getOpcode() == unsigned(Opcode) && + !Simplified->hasPoisonGeneratingFlags()) { // The value that didn't simplify is "UnsimplifiedLHS op UnsimplifiedRHS". // We already know that "op" is the same as for the simplified value. See // if the operands match too. If so, return the simplified value. 
diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index 1ebc593016bc0d..16635097d20afe 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -657,11 +657,12 @@ Value *Lint::findValueImpl(Value *V, bool OffsetOk, BasicBlock::iterator BBI = L->getIterator(); BasicBlock *BB = L->getParent(); SmallPtrSet VisitedBlocks; + BatchAAResults BatchAA(*AA); for (;;) { if (!VisitedBlocks.insert(BB).second) break; if (Value *U = - FindAvailableLoadedValue(L, BB, BBI, DefMaxInstsToScan, AA)) + FindAvailableLoadedValue(L, BB, BBI, DefMaxInstsToScan, &BatchAA)) return findValueImpl(U, OffsetOk, Visited); if (BBI != BB->begin()) break; diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 97d21db86abf28..5916d2ab48ecec 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -364,7 +364,7 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Align Alignment, APInt &Size, if (Size.getBitWidth() > 64) return false; - const uint64_t LoadSize = Size.getZExtValue(); + const TypeSize LoadSize = TypeSize::getFixed(Size.getZExtValue()); // Otherwise, be a little bit aggressive by scanning the local block where we // want to check to see if the pointer is already being loaded or stored @@ -414,11 +414,11 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Align Alignment, APInt &Size, // Handle trivial cases. 
if (AccessedPtr == V && - LoadSize <= DL.getTypeStoreSize(AccessedTy)) + TypeSize::isKnownLE(LoadSize, DL.getTypeStoreSize(AccessedTy))) return true; if (AreEquivalentAddressValues(AccessedPtr->stripPointerCasts(), V) && - LoadSize <= DL.getTypeStoreSize(AccessedTy)) + TypeSize::isKnownLE(LoadSize, DL.getTypeStoreSize(AccessedTy))) return true; } return false; @@ -450,11 +450,10 @@ llvm::DefMaxInstsToScan("available-load-scan-limit", cl::init(6), cl::Hidden, "to scan backward from a given instruction, when searching for " "available loaded value")); -Value *llvm::FindAvailableLoadedValue(LoadInst *Load, - BasicBlock *ScanBB, +Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB, BasicBlock::iterator &ScanFrom, unsigned MaxInstsToScan, - AAResults *AA, bool *IsLoad, + BatchAAResults *AA, bool *IsLoad, unsigned *NumScanedInst) { // Don't CSE load that is volatile or anything stronger than unordered. if (!Load->isUnordered()) @@ -583,7 +582,7 @@ static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr, Value *llvm::findAvailablePtrLoadStore( const MemoryLocation &Loc, Type *AccessTy, bool AtLeastAtomic, BasicBlock *ScanBB, BasicBlock::iterator &ScanFrom, unsigned MaxInstsToScan, - AAResults *AA, bool *IsLoadCSE, unsigned *NumScanedInst) { + BatchAAResults *AA, bool *IsLoadCSE, unsigned *NumScanedInst) { if (MaxInstsToScan == 0) MaxInstsToScan = ~0U; @@ -664,7 +663,7 @@ Value *llvm::findAvailablePtrLoadStore( return nullptr; } -Value *llvm::FindAvailableLoadedValue(LoadInst *Load, AAResults &AA, +Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA, bool *IsLoadCSE, unsigned MaxInstsToScan) { const DataLayout &DL = Load->getModule()->getDataLayout(); diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 7e67c90152829d..dd6b88fee415a7 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -657,16 +657,18 @@ class 
AccessAnalysis { AccessAnalysis(Loop *TheLoop, AAResults *AA, LoopInfo *LI, MemoryDepChecker::DepCandidates &DA, - PredicatedScalarEvolution &PSE) - : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE) { + PredicatedScalarEvolution &PSE, + SmallPtrSetImpl &LoopAliasScopes) + : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE), + LoopAliasScopes(LoopAliasScopes) { // We're analyzing dependences across loop iterations. BAA.enableCrossIterationMode(); } /// Register a load and whether it is only read from. void addLoad(MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) { - Value *Ptr = const_cast(Loc.Ptr); - AST.add(Loc.getWithNewSize(LocationSize::beforeOrAfterPointer())); + Value *Ptr = const_cast(Loc.Ptr); + AST.add(adjustLoc(Loc)); Accesses[MemAccessInfo(Ptr, false)].insert(AccessTy); if (IsReadOnly) ReadOnlyPtr.insert(Ptr); @@ -674,8 +676,8 @@ class AccessAnalysis { /// Register a store. void addStore(MemoryLocation &Loc, Type *AccessTy) { - Value *Ptr = const_cast(Loc.Ptr); - AST.add(Loc.getWithNewSize(LocationSize::beforeOrAfterPointer())); + Value *Ptr = const_cast(Loc.Ptr); + AST.add(adjustLoc(Loc)); Accesses[MemAccessInfo(Ptr, true)].insert(AccessTy); } @@ -731,6 +733,32 @@ class AccessAnalysis { private: typedef MapVector> PtrAccessMap; + /// Adjust the MemoryLocation so that it represents accesses to this + /// location across all iterations, rather than a single one. + MemoryLocation adjustLoc(MemoryLocation Loc) const { + // The accessed location varies within the loop, but remains within the + // underlying object. + Loc.Size = LocationSize::beforeOrAfterPointer(); + Loc.AATags.Scope = adjustAliasScopeList(Loc.AATags.Scope); + Loc.AATags.NoAlias = adjustAliasScopeList(Loc.AATags.NoAlias); + return Loc; + } + + /// Drop alias scopes that are only valid within a single loop iteration. 
+ MDNode *adjustAliasScopeList(MDNode *ScopeList) const { + if (!ScopeList) + return nullptr; + + // For the sake of simplicity, drop the whole scope list if any scope is + // iteration-local. + if (any_of(ScopeList->operands(), [&](Metadata *Scope) { + return LoopAliasScopes.contains(cast(Scope)); + })) + return nullptr; + + return ScopeList; + } + /// Go over all memory access and check whether runtime pointer checks /// are needed and build sets of dependency check candidates. void processMemAccesses(); @@ -775,6 +803,10 @@ class AccessAnalysis { PredicatedScalarEvolution &PSE; DenseMap> UnderlyingObjects; + + /// Alias scopes that are declared inside the loop, and as such not valid + /// across iterations. + SmallPtrSetImpl &LoopAliasScopes; }; } // end anonymous namespace @@ -2283,6 +2315,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, // Holds the Load and Store instructions. SmallVector Loads; SmallVector Stores; + SmallPtrSet LoopAliasScopes; // Holds all the different accesses in the loop. unsigned NumReads = 0; @@ -2326,6 +2359,11 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, if (HasComplexMemInst) continue; + // Record alias scopes defined inside the loop. + if (auto *Decl = dyn_cast(&I)) + for (Metadata *Op : Decl->getScopeList()->operands()) + LoopAliasScopes.insert(cast(Op)); + // Many math library functions read the rounding mode. We will only // vectorize a loop if it contains known function calls that don't set // the flag. Therefore, it is safe to ignore this read from memory. @@ -2407,7 +2445,8 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, } MemoryDepChecker::DepCandidates DependentAccesses; - AccessAnalysis Accesses(TheLoop, AA, LI, DependentAccesses, *PSE); + AccessAnalysis Accesses(TheLoop, AA, LI, DependentAccesses, *PSE, + LoopAliasScopes); // Holds the analyzed pointers. We don't want to call getUnderlyingObjects // multiple times on the same object. 
If the ptr is accessed twice, once diff --git a/llvm/lib/Analysis/MemorySSAUpdater.cpp b/llvm/lib/Analysis/MemorySSAUpdater.cpp index e87ae7d71fffe2..aa550f0b6a7bfd 100644 --- a/llvm/lib/Analysis/MemorySSAUpdater.cpp +++ b/llvm/lib/Analysis/MemorySSAUpdater.cpp @@ -692,25 +692,9 @@ void MemorySSAUpdater::updateForClonedLoop(const LoopBlocksRPO &LoopBlocks, continue; // Determine incoming value and add it as incoming from IncBB. - if (MemoryUseOrDef *IncMUD = dyn_cast(IncomingAccess)) { - if (!MSSA->isLiveOnEntryDef(IncMUD)) { - Instruction *IncI = IncMUD->getMemoryInst(); - assert(IncI && "Found MemoryUseOrDef with no Instruction."); - if (Instruction *NewIncI = - cast_or_null(VMap.lookup(IncI))) { - IncMUD = MSSA->getMemoryAccess(NewIncI); - assert(IncMUD && - "MemoryUseOrDef cannot be null, all preds processed."); - } - } - NewPhi->addIncoming(IncMUD, IncBB); - } else { - MemoryPhi *IncPhi = cast(IncomingAccess); - if (MemoryAccess *NewDefPhi = MPhiMap.lookup(IncPhi)) - NewPhi->addIncoming(NewDefPhi, IncBB); - else - NewPhi->addIncoming(IncPhi, IncBB); - } + NewPhi->addIncoming( + getNewDefiningAccessForClone(IncomingAccess, VMap, MPhiMap, MSSA), + IncBB); } if (auto *SingleAccess = onlySingleValue(NewPhi)) { MPhiMap[Phi] = SingleAccess; diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 2acb45837c480a..4b2db80bc1ec30 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -4184,6 +4184,68 @@ void ScalarEvolution::getPoisonGeneratingValues( Result.insert(SU->getValue()); } +bool ScalarEvolution::canReuseInstruction( + const SCEV *S, Instruction *I, + SmallVectorImpl &DropPoisonGeneratingInsts) { + // If the instruction cannot be poison, it's always safe to reuse. + if (programUndefinedIfPoison(I)) + return true; + + // Otherwise, it is possible that I is more poisonous that S. 
Collect the + // poison-contributors of S, and then check whether I has any additional + // poison-contributors. Poison that is contributed through poison-generating + // flags is handled by dropping those flags instead. + SmallPtrSet PoisonVals; + getPoisonGeneratingValues(PoisonVals, S); + + SmallVector Worklist; + SmallPtrSet Visited; + Worklist.push_back(I); + while (!Worklist.empty()) { + Value *V = Worklist.pop_back_val(); + if (!Visited.insert(V).second) + continue; + + // Avoid walking large instruction graphs. + if (Visited.size() > 16) + return false; + + // Either the value can't be poison, or the S would also be poison if it + // is. + if (PoisonVals.contains(V) || isGuaranteedNotToBePoison(V)) + continue; + + auto *I = dyn_cast(V); + if (!I) + return false; + + // Disjoint or instructions are interpreted as adds by SCEV. However, we + // can't replace an arbitrary add with disjoint or, even if we drop the + // flag. We would need to convert the or into an add. + if (auto *PDI = dyn_cast(I)) + if (PDI->isDisjoint()) + return false; + + // FIXME: Ignore vscale, even though it technically could be poison. Do this + // because SCEV currently assumes it can't be poison. Remove this special + // case once we proper model when vscale can be poison. + if (auto *II = dyn_cast(I); + II && II->getIntrinsicID() == Intrinsic::vscale) + continue; + + if (canCreatePoison(cast(I), /*ConsiderFlagsAndMetadata*/ false)) + return false; + + // If the instruction can't create poison, we can recurse to its operands. 
+ if (I->hasPoisonGeneratingFlagsOrMetadata()) + DropPoisonGeneratingInsts.push_back(I); + + for (Value *Op : I->operands()) + Worklist.push_back(Op); + } + return true; +} + const SCEV * ScalarEvolution::getSequentialMinMaxExpr(SCEVTypes Kind, SmallVectorImpl &Ops) { diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 5d6c3465a0c364..9f9451e4e814ac 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -5083,8 +5083,13 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts, Op->getOperand(0)->getType()->getScalarType()->getFltSemantics(); // All subnormal inputs should be in the normal range in the result type. - if (APFloat::isRepresentableAsNormalIn(SrcTy, DstTy)) + if (APFloat::isRepresentableAsNormalIn(SrcTy, DstTy)) { + if (Known.KnownFPClasses & fcPosSubnormal) + Known.KnownFPClasses |= fcPosNormal; + if (Known.KnownFPClasses & fcNegSubnormal) + Known.KnownFPClasses |= fcNegNormal; Known.knownNot(fcSubnormal); + } // Sign bit of a nan isn't guaranteed. 
if (!Known.isKnownNeverNaN()) @@ -5981,6 +5986,8 @@ void llvm::getUnderlyingObjects(const Value *V, if (!LI || !LI->isLoopHeader(PN->getParent()) || isSameUnderlyingObjectInLoop(PN, LI)) append_range(Worklist, PN->incoming_values()); + else + Objects.push_back(P); continue; } diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 73facc76a92b2c..bf7bc0ba84a033 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -1012,6 +1012,31 @@ bool llvm::maskIsAllOneOrUndef(Value *Mask) { return true; } +bool llvm::maskContainsAllOneOrUndef(Value *Mask) { + assert(isa(Mask->getType()) && + isa(Mask->getType()->getScalarType()) && + cast(Mask->getType()->getScalarType())->getBitWidth() == + 1 && + "Mask must be a vector of i1"); + + auto *ConstMask = dyn_cast(Mask); + if (!ConstMask) + return false; + if (ConstMask->isAllOnesValue() || isa(ConstMask)) + return true; + if (isa(ConstMask->getType())) + return false; + for (unsigned + I = 0, + E = cast(ConstMask->getType())->getNumElements(); + I != E; ++I) { + if (auto *MaskElt = ConstMask->getAggregateElement(I)) + if (MaskElt->isAllOnesValue() || isa(MaskElt)) + return true; + } + return false; +} + /// TODO: This is a lot like known bits, but for /// vectors. Is there something we can common this with? 
APInt llvm::possiblyDemandedEltsInMask(Value *Mask) { diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp index a9f78358e57b92..ecf7bc30913f51 100644 --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -2048,8 +2048,10 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { FBB->erase(FBB->begin(), FIB); if (UpdateLiveIns) { - recomputeLiveIns(*TBB); - recomputeLiveIns(*FBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*TBB) || recomputeLiveIns(*FBB); + } while (anyChange); } ++NumHoist; diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 8ee1f19e083e4e..1cca56fc19cfd8 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -8154,6 +8154,7 @@ static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI, IRBuilder<> Builder(Branch); if (UI->getParent() != Branch->getParent()) UI->moveBefore(Branch); + UI->dropPoisonGeneratingFlags(); Value *NewCmp = Builder.CreateCmp(ICmpInst::ICMP_EQ, UI, ConstantInt::get(UI->getType(), 0)); LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n"); @@ -8167,6 +8168,7 @@ static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI, IRBuilder<> Builder(Branch); if (UI->getParent() != Branch->getParent()) UI->moveBefore(Branch); + UI->dropPoisonGeneratingFlags(); Value *NewCmp = Builder.CreateCmp(Cmp->getPredicate(), UI, ConstantInt::get(UI->getType(), 0)); LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n"); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 772229215e798d..61ddc858ba44c7 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -591,8 +591,8 @@ bool CombinerHelper::matchCombineExtendingLoads(MachineInstr &MI, UseMI.getOpcode() == TargetOpcode::G_ZEXT || (UseMI.getOpcode() == TargetOpcode::G_ANYEXT)) { 
const auto &MMO = LoadMI->getMMO(); - // For atomics, only form anyextending loads. - if (MMO.isAtomic() && UseMI.getOpcode() != TargetOpcode::G_ANYEXT) + // Don't do anything for atomics. + if (MMO.isAtomic()) continue; // Check for legality. if (!isPreLegalize()) { diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 3b2cf319109273..47d045ac48171e 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -596,6 +596,8 @@ llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall, LostDebugLocObserver &LocObserver, MachineInstr *MI) { auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); const char *Name = TLI.getLibcallName(Libcall); + if (!Name) + return LegalizerHelper::UnableToLegalize; const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall); return createLibcall(MIRBuilder, Name, Result, Args, CC, LocObserver, MI); } @@ -4178,6 +4180,10 @@ LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI, } } + // Set the insert point after the existing PHIs + MachineBasicBlock &MBB = *MI.getParent(); + MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI()); + // Merge small outputs into MI's def. 
if (NumLeftovers) { mergeMixedSubvectors(MI.getReg(0), OutputRegs); diff --git a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp index 246aa88b09acf6..ee499c41c558c3 100644 --- a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp @@ -84,21 +84,20 @@ BaseIndexOffset GISelAddressing::getPointerInfo(Register Ptr, MachineRegisterInfo &MRI) { BaseIndexOffset Info; Register PtrAddRHS; - if (!mi_match(Ptr, MRI, m_GPtrAdd(m_Reg(Info.BaseReg), m_Reg(PtrAddRHS)))) { - Info.BaseReg = Ptr; - Info.IndexReg = Register(); - Info.IsIndexSignExt = false; + Register BaseReg; + if (!mi_match(Ptr, MRI, m_GPtrAdd(m_Reg(BaseReg), m_Reg(PtrAddRHS)))) { + Info.setBase(Ptr); + Info.setOffset(0); return Info; } - + Info.setBase(BaseReg); auto RHSCst = getIConstantVRegValWithLookThrough(PtrAddRHS, MRI); if (RHSCst) - Info.Offset = RHSCst->Value.getSExtValue(); + Info.setOffset(RHSCst->Value.getSExtValue()); // Just recognize a simple case for now. In future we'll need to match // indexing patterns for base + index + constant. 
- Info.IndexReg = PtrAddRHS; - Info.IsIndexSignExt = false; + Info.setIndex(PtrAddRHS); return Info; } @@ -114,15 +113,16 @@ bool GISelAddressing::aliasIsKnownForLoadStore(const MachineInstr &MI1, BaseIndexOffset BasePtr0 = getPointerInfo(LdSt1->getPointerReg(), MRI); BaseIndexOffset BasePtr1 = getPointerInfo(LdSt2->getPointerReg(), MRI); - if (!BasePtr0.BaseReg.isValid() || !BasePtr1.BaseReg.isValid()) + if (!BasePtr0.getBase().isValid() || !BasePtr1.getBase().isValid()) return false; int64_t Size1 = LdSt1->getMemSize(); int64_t Size2 = LdSt2->getMemSize(); int64_t PtrDiff; - if (BasePtr0.BaseReg == BasePtr1.BaseReg) { - PtrDiff = BasePtr1.Offset - BasePtr0.Offset; + if (BasePtr0.getBase() == BasePtr1.getBase() && BasePtr0.hasValidOffset() && + BasePtr1.hasValidOffset()) { + PtrDiff = BasePtr1.getOffset() - BasePtr0.getOffset(); // If the size of memory access is unknown, do not use it to do analysis. // One example of unknown size memory access is to load/store scalable // vector objects on the stack. @@ -151,8 +151,8 @@ bool GISelAddressing::aliasIsKnownForLoadStore(const MachineInstr &MI1, // able to calculate their relative offset if at least one arises // from an alloca. However, these allocas cannot overlap and we // can infer there is no alias. - auto *Base0Def = getDefIgnoringCopies(BasePtr0.BaseReg, MRI); - auto *Base1Def = getDefIgnoringCopies(BasePtr1.BaseReg, MRI); + auto *Base0Def = getDefIgnoringCopies(BasePtr0.getBase(), MRI); + auto *Base1Def = getDefIgnoringCopies(BasePtr1.getBase(), MRI); if (!Base0Def || !Base1Def) return false; // Couldn't tell anything. 
@@ -520,16 +520,20 @@ bool LoadStoreOpt::addStoreToCandidate(GStore &StoreMI, Register StoreAddr = StoreMI.getPointerReg(); auto BIO = getPointerInfo(StoreAddr, *MRI); - Register StoreBase = BIO.BaseReg; - uint64_t StoreOffCst = BIO.Offset; + Register StoreBase = BIO.getBase(); if (C.Stores.empty()) { + C.BasePtr = StoreBase; + if (!BIO.hasValidOffset()) { + C.CurrentLowestOffset = 0; + } else { + C.CurrentLowestOffset = BIO.getOffset(); + } // This is the first store of the candidate. // If the offset can't possibly allow for a lower addressed store with the // same base, don't bother adding it. - if (StoreOffCst < ValueTy.getSizeInBytes()) + if (BIO.hasValidOffset() && + BIO.getOffset() < static_cast(ValueTy.getSizeInBytes())) return false; - C.BasePtr = StoreBase; - C.CurrentLowestOffset = StoreOffCst; C.Stores.emplace_back(&StoreMI); LLVM_DEBUG(dbgs() << "Starting a new merge candidate group with: " << StoreMI); @@ -549,8 +553,12 @@ bool LoadStoreOpt::addStoreToCandidate(GStore &StoreMI, // writes to the next lowest adjacent address. if (C.BasePtr != StoreBase) return false; - if ((C.CurrentLowestOffset - ValueTy.getSizeInBytes()) != - static_cast(StoreOffCst)) + // If we don't have a valid offset, we can't guarantee to be an adjacent + // offset. + if (!BIO.hasValidOffset()) + return false; + if ((C.CurrentLowestOffset - + static_cast(ValueTy.getSizeInBytes())) != BIO.getOffset()) return false; // This writes to an adjacent address. Allow it. diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index cbb1a74049fbd7..7e9c992031f8d3 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -236,7 +236,8 @@ namespace { /// was successfully coalesced away. If it is not currently possible to /// coalesce this interval, but it may be possible if other things get /// coalesced, then it returns true by reference in 'Again'. 
- bool joinCopy(MachineInstr *CopyMI, bool &Again); + bool joinCopy(MachineInstr *CopyMI, bool &Again, + SmallPtrSetImpl &CurrentErasedInstrs); /// Attempt to join these two intervals. On failure, this /// returns false. The output "SrcInt" will not have been modified, so we @@ -1964,7 +1965,9 @@ void RegisterCoalescer::setUndefOnPrunedSubRegUses(LiveInterval &LI, LIS->shrinkToUses(&LI); } -bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { +bool RegisterCoalescer::joinCopy( + MachineInstr *CopyMI, bool &Again, + SmallPtrSetImpl &CurrentErasedInstrs) { Again = false; LLVM_DEBUG(dbgs() << LIS->getInstructionIndex(*CopyMI) << '\t' << *CopyMI); @@ -2156,7 +2159,9 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { // CopyMI has been erased by joinIntervals at this point. Remove it from // ErasedInstrs since copyCoalesceWorkList() won't add a successful join back // to the work list. This keeps ErasedInstrs from growing needlessly. - ErasedInstrs.erase(CopyMI); + if (ErasedInstrs.erase(CopyMI)) + // But we may encounter the instruction again in this iteration. + CurrentErasedInstrs.insert(CopyMI); // Rewrite all SrcReg operands to DstReg. // Also update DstReg operands to include DstIdx if it is set. @@ -3982,21 +3987,33 @@ void RegisterCoalescer::lateLiveIntervalUpdate() { bool RegisterCoalescer:: copyCoalesceWorkList(MutableArrayRef CurrList) { bool Progress = false; + SmallPtrSet CurrentErasedInstrs; for (MachineInstr *&MI : CurrList) { if (!MI) continue; // Skip instruction pointers that have already been erased, for example by // dead code elimination. - if (ErasedInstrs.count(MI)) { + if (ErasedInstrs.count(MI) || CurrentErasedInstrs.count(MI)) { MI = nullptr; continue; } bool Again = false; - bool Success = joinCopy(MI, Again); + bool Success = joinCopy(MI, Again, CurrentErasedInstrs); Progress |= Success; if (Success || !Again) MI = nullptr; } + // Clear instructions not recorded in `ErasedInstrs` but erased. 
+ if (!CurrentErasedInstrs.empty()) { + for (MachineInstr *&MI : CurrList) { + if (MI && CurrentErasedInstrs.count(MI)) + MI = nullptr; + } + for (MachineInstr *&MI : WorkList) { + if (MI && CurrentErasedInstrs.count(MI)) + MI = nullptr; + } + } return Progress; } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 98d8a6d9409f25..5038f8a1fc1562 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -3575,6 +3575,11 @@ static SDValue combineCarryDiamond(SelectionDAG &DAG, const TargetLowering &TLI, return SDValue(); if (Opcode != ISD::UADDO && Opcode != ISD::USUBO) return SDValue(); + // Guarantee identical type of CarryOut + EVT CarryOutType = N->getValueType(0); + if (CarryOutType != Carry0.getValue(1).getValueType() || + CarryOutType != Carry1.getValue(1).getValueType()) + return SDValue(); // Canonicalize the add/sub of A and B (the top node in the above ASCII art) // as Carry0 and the add/sub of the carry in as Carry1 (the middle node). @@ -3622,7 +3627,7 @@ static SDValue combineCarryDiamond(SelectionDAG &DAG, const TargetLowering &TLI, // TODO: match other operations that can merge flags (ADD, etc) DAG.ReplaceAllUsesOfValueWith(Carry1.getValue(0), Merged.getValue(0)); if (N->getOpcode() == ISD::AND) - return DAG.getConstant(0, DL, MVT::i1); + return DAG.getConstant(0, DL, CarryOutType); return Merged.getValue(1); } @@ -9253,7 +9258,7 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { // Transfer chain users from old loads to the new load. 
for (LoadSDNode *L : Loads) - DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1)); + DAG.makeEquivalentMemoryOrdering(L, NewLoad); if (!NeedsBswap) return NewLoad; @@ -9631,8 +9636,15 @@ static SDValue combineShiftOfShiftedLogic(SDNode *Shift, SelectionDAG &DAG) { if (ShiftAmtVal->getBitWidth() != C1Val.getBitWidth()) return false; + // The fold is not valid if the sum of the shift values doesn't fit in the + // given shift amount type. + bool Overflow = false; + APInt NewShiftAmt = C1Val.uadd_ov(*ShiftAmtVal, Overflow); + if (Overflow) + return false; + // The fold is not valid if the sum of the shift values exceeds bitwidth. - if ((*ShiftAmtVal + C1Val).uge(V.getScalarValueSizeInBits())) + if (NewShiftAmt.uge(V.getScalarValueSizeInBits())) return false; return true; diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index fd5160209506f2..19076771ff2eaf 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -2045,6 +2045,11 @@ static bool isEqual(const Function &Caller, const Function &Callee) { Callee.getFnAttribute(AttrClass::getKind()); } +static bool isEqual(const Function &Caller, const Function &Callee, + const StringRef &AttrName) { + return Caller.getFnAttribute(AttrName) == Callee.getFnAttribute(AttrName); +} + /// Compute the logical AND of the attributes of the caller and the /// callee. 
/// diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp index cbb64b299e648e..f105bdb4816aa0 100644 --- a/llvm/lib/IR/ConstantRange.cpp +++ b/llvm/lib/IR/ConstantRange.cpp @@ -746,7 +746,7 @@ ConstantRange ConstantRange::castOp(Instruction::CastOps CastOp, Min = Min.zext(ResultBitWidth); Max = Max.zext(ResultBitWidth); } - return ConstantRange(std::move(Min), std::move(Max)); + return getNonEmpty(std::move(Min), std::move(Max) + 1); } case Instruction::SIToFP: { // TODO: use input range if available @@ -757,7 +757,7 @@ ConstantRange ConstantRange::castOp(Instruction::CastOps CastOp, SMin = SMin.sext(ResultBitWidth); SMax = SMax.sext(ResultBitWidth); } - return ConstantRange(std::move(SMin), std::move(SMax)); + return getNonEmpty(std::move(SMin), std::move(SMax) + 1); } case Instruction::FPTrunc: case Instruction::FPExt: diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 8e508dbdb1c69b..026d252ec5bcd7 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -44,6 +44,7 @@ #include "llvm/MC/MCSection.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCSymbolMachO.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/Casting.h" @@ -1950,7 +1951,8 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, Lex(); } - if (MAI.hasSubsectionsViaSymbols() && CFIStartProcLoc && Sym->isExternal()) + if (MAI.hasSubsectionsViaSymbols() && CFIStartProcLoc && + Sym->isExternal() && !cast(Sym)->isAltEntry()) return Error(StartTokLoc, "non-private labels cannot appear between " ".cfi_startproc / .cfi_endproc pairs") && Error(*CFIStartProcLoc, "previous .cfi_startproc was here"); diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp index 155926a8c5949d..1a7ed2db543964 100644 --- a/llvm/lib/Object/ArchiveWriter.cpp +++ b/llvm/lib/Object/ArchiveWriter.cpp @@ -677,6 +677,13 @@ static bool 
isECObject(object::SymbolicFile &Obj) { return false; } +bool isImportDescriptor(StringRef Name) { + return Name.starts_with(ImportDescriptorPrefix) || + Name == StringRef{NullImportDescriptorSymbolName} || + (Name.starts_with(NullThunkDataPrefix) && + Name.ends_with(NullThunkDataSuffix)); +} + static Expected> getSymbols(SymbolicFile *Obj, uint16_t Index, raw_ostream &SymNames, @@ -704,6 +711,10 @@ static Expected> getSymbols(SymbolicFile *Obj, if (Map == &SymMap->Map) { Ret.push_back(SymNames.tell()); SymNames << Name << '\0'; + // If EC is enabled, then the import descriptors are NOT put into EC + // objects so we need to copy them to the EC map manually. + if (SymMap->UseECMap && isImportDescriptor(Name)) + SymMap->ECMap[Name] = Index; } } else { Ret.push_back(SymNames.tell()); diff --git a/llvm/lib/Object/COFFImportFile.cpp b/llvm/lib/Object/COFFImportFile.cpp index 60556c149bf735..d3b5cf2d9f7b52 100644 --- a/llvm/lib/Object/COFFImportFile.cpp +++ b/llvm/lib/Object/COFFImportFile.cpp @@ -52,6 +52,38 @@ StringRef COFFImportFile::getFileFormatName() const { } } +StringRef COFFImportFile::getExportName() const { + const coff_import_header *hdr = getCOFFImportHeader(); + StringRef name = Data.getBuffer().substr(sizeof(*hdr)).split('\0').first; + + auto ltrim1 = [](StringRef s, StringRef chars) { + return !s.empty() && chars.contains(s[0]) ? 
s.substr(1) : s; + }; + + switch (hdr->getNameType()) { + case IMPORT_ORDINAL: + name = ""; + break; + case IMPORT_NAME_NOPREFIX: + name = ltrim1(name, "?@_"); + break; + case IMPORT_NAME_UNDECORATE: + name = ltrim1(name, "?@_"); + name = name.substr(0, name.find('@')); + break; + case IMPORT_NAME_EXPORTAS: { + // Skip DLL name + name = Data.getBuffer().substr(sizeof(*hdr) + name.size() + 1); + name = name.split('\0').second.split('\0').first; + break; + } + default: + break; + } + + return name; +} + static uint16_t getImgRelRelocation(MachineTypes Machine) { switch (Machine) { default: @@ -76,7 +108,7 @@ template static void append(std::vector &B, const T &Data) { } static void writeStringTable(std::vector &B, - ArrayRef Strings) { + ArrayRef Strings) { // The COFF string table consists of a 4-byte value which is the size of the // table, including the length field itself. This value is followed by the // string content itself, which is an array of null-terminated C-style @@ -139,9 +171,6 @@ static Expected replace(StringRef S, StringRef From, return (Twine(S.substr(0, Pos)) + To + S.substr(Pos + From.size())).str(); } -static const std::string NullImportDescriptorSymbolName = - "__NULL_IMPORT_DESCRIPTOR"; - namespace { // This class constructs various small object files necessary to support linking // symbols imported from a DLL. The contents are pretty strictly defined and @@ -160,8 +189,9 @@ class ObjectFactory { public: ObjectFactory(StringRef S, MachineTypes M) : NativeMachine(M), ImportName(S), Library(llvm::sys::path::stem(S)), - ImportDescriptorSymbolName(("__IMPORT_DESCRIPTOR_" + Library).str()), - NullThunkSymbolName(("\x7f" + Library + "_NULL_THUNK_DATA").str()) {} + ImportDescriptorSymbolName((ImportDescriptorPrefix + Library).str()), + NullThunkSymbolName( + (NullThunkDataPrefix + Library + NullThunkDataSuffix).str()) {} // Creates an Import Descriptor. 
This is a small object file which contains a // reference to the terminators and contains the library name (entry) for the @@ -183,6 +213,7 @@ class ObjectFactory { // Library Format. NewArchiveMember createShortImport(StringRef Sym, uint16_t Ordinal, ImportType Type, ImportNameType NameType, + StringRef ExportName, MachineTypes Machine); // Create a weak external file which is described in PE/COFF Aux Format 3. @@ -474,12 +505,13 @@ NewArchiveMember ObjectFactory::createNullThunk(std::vector &Buffer) { return {MemoryBufferRef{F, ImportName}}; } -NewArchiveMember ObjectFactory::createShortImport(StringRef Sym, - uint16_t Ordinal, - ImportType ImportType, - ImportNameType NameType, - MachineTypes Machine) { +NewArchiveMember +ObjectFactory::createShortImport(StringRef Sym, uint16_t Ordinal, + ImportType ImportType, ImportNameType NameType, + StringRef ExportName, MachineTypes Machine) { size_t ImpSize = ImportName.size() + Sym.size() + 2; // +2 for NULs + if (!ExportName.empty()) + ImpSize += ExportName.size() + 1; size_t Size = sizeof(coff_import_header) + ImpSize; char *Buf = Alloc.Allocate(Size); memset(Buf, 0, Size); @@ -499,6 +531,10 @@ NewArchiveMember ObjectFactory::createShortImport(StringRef Sym, memcpy(P, Sym.data(), Sym.size()); P += Sym.size() + 1; memcpy(P, ImportName.data(), ImportName.size()); + if (!ExportName.empty()) { + P += ImportName.size() + 1; + memcpy(P, ExportName.data(), ExportName.size()); + } return {MemoryBufferRef(StringRef(Buf, Size), ImportName)}; } @@ -615,27 +651,51 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path, ImportType = IMPORT_CONST; StringRef SymbolName = E.SymbolName.empty() ? E.Name : E.SymbolName; - ImportNameType NameType = E.Noname - ? IMPORT_ORDINAL - : getNameType(SymbolName, E.Name, - Machine, MinGW); - Expected Name = E.ExtName.empty() - ? 
std::string(SymbolName) - : replace(SymbolName, E.Name, E.ExtName); - - if (!Name) - return Name.takeError(); - - if (!E.AliasTarget.empty() && *Name != E.AliasTarget) { + std::string Name; + + if (E.ExtName.empty()) { + Name = std::string(SymbolName); + } else { + Expected ReplacedName = + replace(SymbolName, E.Name, E.ExtName); + if (!ReplacedName) + return ReplacedName.takeError(); + Name.swap(*ReplacedName); + } + + if (!E.AliasTarget.empty() && Name != E.AliasTarget) { Members.push_back( - OF.createWeakExternal(E.AliasTarget, *Name, false, Machine)); + OF.createWeakExternal(E.AliasTarget, Name, false, Machine)); Members.push_back( - OF.createWeakExternal(E.AliasTarget, *Name, true, Machine)); + OF.createWeakExternal(E.AliasTarget, Name, true, Machine)); continue; } - Members.push_back( - OF.createShortImport(*Name, E.Ordinal, ImportType, NameType, Machine)); + ImportNameType NameType; + std::string ExportName; + if (E.Noname) { + NameType = IMPORT_ORDINAL; + } else { + NameType = getNameType(SymbolName, E.Name, Machine, MinGW); + } + + // On ARM64EC, use EXPORTAS to import demangled name for mangled symbols. 
+ if (ImportType == IMPORT_CODE && isArm64EC(Machine)) { + if (std::optional MangledName = + getArm64ECMangledFunctionName(Name)) { + if (ExportName.empty()) { + NameType = IMPORT_NAME_EXPORTAS; + ExportName.swap(Name); + } + Name = std::move(*MangledName); + } else if (ExportName.empty()) { + NameType = IMPORT_NAME_EXPORTAS; + ExportName = std::move(*getArm64ECDemangledFunctionName(Name)); + } + } + + Members.push_back(OF.createShortImport(Name, E.Ordinal, ImportType, + NameType, ExportName, Machine)); } return writeArchive(Path, Members, SymtabWritingMode::NormalSymtab, diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp index da8e1d87319dde..a357b4cb492111 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -14,6 +14,7 @@ #include "llvm/ProfileData/Coverage/CoverageMapping.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" @@ -583,6 +584,160 @@ static unsigned getMaxBitmapSize(const CounterMappingContext &Ctx, return MaxBitmapID + (SizeInBits / CHAR_BIT); } +namespace { + +/// Collect Decisions, Branchs, and Expansions and associate them. +class MCDCDecisionRecorder { +private: + /// This holds the DecisionRegion and MCDCBranches under it. + /// Also traverses Expansion(s). + /// The Decision has the number of MCDCBranches and will complete + /// when it is filled with unique ConditionID of MCDCBranches. + struct DecisionRecord { + const CounterMappingRegion *DecisionRegion; + + /// They are reflected from DecisionRegion for convenience. + LineColPair DecisionStartLoc; + LineColPair DecisionEndLoc; + + /// This is passed to `MCDCRecordProcessor`, so this should be compatible + /// to`ArrayRef`. 
+ SmallVector MCDCBranches; + + /// IDs that are stored in MCDCBranches + /// Complete when all IDs (1 to NumConditions) are met. + DenseSet ConditionIDs; + + /// Set of IDs of Expansion(s) that are relevant to DecisionRegion + /// and its children (via expansions). + /// FileID pointed by ExpandedFileID is dedicated to the expansion, so + /// the location in the expansion doesn't matter. + DenseSet ExpandedFileIDs; + + DecisionRecord(const CounterMappingRegion &Decision) + : DecisionRegion(&Decision), DecisionStartLoc(Decision.startLoc()), + DecisionEndLoc(Decision.endLoc()) { + assert(Decision.Kind == CounterMappingRegion::MCDCDecisionRegion); + } + + /// Determine whether DecisionRecord dominates `R`. + bool dominates(const CounterMappingRegion &R) const { + // Determine whether `R` is included in `DecisionRegion`. + if (R.FileID == DecisionRegion->FileID && + R.startLoc() >= DecisionStartLoc && R.endLoc() <= DecisionEndLoc) + return true; + + // Determine whether `R` is pointed by any of Expansions. + return ExpandedFileIDs.contains(R.FileID); + } + + enum Result { + NotProcessed = 0, /// Irrelevant to this Decision + Processed, /// Added to this Decision + Completed, /// Added and filled this Decision + }; + + /// Add Branch into the Decision + /// \param Branch expects MCDCBranchRegion + /// \returns NotProcessed/Processed/Completed + Result addBranch(const CounterMappingRegion &Branch) { + assert(Branch.Kind == CounterMappingRegion::MCDCBranchRegion); + + auto ConditionID = Branch.MCDCParams.ID; + assert(ConditionID > 0 && "ConditionID should begin with 1"); + + if (ConditionIDs.contains(ConditionID) || + ConditionID > DecisionRegion->MCDCParams.NumConditions) + return NotProcessed; + + if (!this->dominates(Branch)) + return NotProcessed; + + assert(MCDCBranches.size() < DecisionRegion->MCDCParams.NumConditions); + + // Put `ID=1` in front of `MCDCBranches` for convenience + // even if `MCDCBranches` is not topological. 
+ if (ConditionID == 1) + MCDCBranches.insert(MCDCBranches.begin(), &Branch); + else + MCDCBranches.push_back(&Branch); + + // Mark `ID` as `assigned`. + ConditionIDs.insert(ConditionID); + + // `Completed` when `MCDCBranches` is full + return (MCDCBranches.size() == DecisionRegion->MCDCParams.NumConditions + ? Completed + : Processed); + } + + /// Record Expansion if it is relevant to this Decision. + /// Each `Expansion` may nest. + /// \returns true if recorded. + bool recordExpansion(const CounterMappingRegion &Expansion) { + if (!this->dominates(Expansion)) + return false; + + ExpandedFileIDs.insert(Expansion.ExpandedFileID); + return true; + } + }; + +private: + /// Decisions in progress + /// DecisionRecord is added for each MCDCDecisionRegion. + /// DecisionRecord is removed when Decision is completed. + SmallVector Decisions; + +public: + ~MCDCDecisionRecorder() { + assert(Decisions.empty() && "All Decisions have not been resolved"); + } + + /// Register Region and start recording. + void registerDecision(const CounterMappingRegion &Decision) { + Decisions.emplace_back(Decision); + } + + void recordExpansion(const CounterMappingRegion &Expansion) { + any_of(Decisions, [&Expansion](auto &Decision) { + return Decision.recordExpansion(Expansion); + }); + } + + using DecisionAndBranches = + std::pair /// Branches + >; + + /// Add MCDCBranchRegion to DecisionRecord. + /// \param Branch to be processed + /// \returns DecisionsAndBranches if DecisionRecord completed. + /// Or returns nullopt. + std::optional + processBranch(const CounterMappingRegion &Branch) { + // Seek each Decision and apply Region to it. 
+ for (auto DecisionIter = Decisions.begin(), DecisionEnd = Decisions.end(); + DecisionIter != DecisionEnd; ++DecisionIter) + switch (DecisionIter->addBranch(Branch)) { + case DecisionRecord::NotProcessed: + continue; + case DecisionRecord::Processed: + return std::nullopt; + case DecisionRecord::Completed: + DecisionAndBranches Result = + std::make_pair(DecisionIter->DecisionRegion, + std::move(DecisionIter->MCDCBranches)); + Decisions.erase(DecisionIter); // No longer used. + return Result; + } + + llvm_unreachable("Branch not found in Decisions"); + } +}; + +} // namespace + Error CoverageMapping::loadFunctionRecord( const CoverageMappingRecord &Record, IndexedInstrProfReader &ProfileReader) { @@ -639,18 +794,13 @@ Error CoverageMapping::loadFunctionRecord( Record.MappingRegions[0].Count.isZero() && Counts[0] > 0) return Error::success(); - unsigned NumConds = 0; - const CounterMappingRegion *MCDCDecision; - std::vector MCDCBranches; - + MCDCDecisionRecorder MCDCDecisions; FunctionRecord Function(OrigFuncName, Record.Filenames); for (const auto &Region : Record.MappingRegions) { - // If an MCDCDecisionRegion is seen, track the BranchRegions that follow - // it according to Region.NumConditions. + // MCDCDecisionRegion should be handled first since it overlaps with + // others inside. if (Region.Kind == CounterMappingRegion::MCDCDecisionRegion) { - assert(NumConds == 0); - MCDCDecision = &Region; - NumConds = Region.MCDCParams.NumConditions; + MCDCDecisions.registerDecision(Region); continue; } Expected ExecutionCount = Ctx.evaluate(Region.Count); @@ -665,43 +815,47 @@ Error CoverageMapping::loadFunctionRecord( } Function.pushRegion(Region, *ExecutionCount, *AltExecutionCount); - // If a MCDCDecisionRegion was seen, store the BranchRegions that - // correspond to it in a vector, according to the number of conditions - // recorded for the region (tracked by NumConds). 
- if (NumConds > 0 && Region.Kind == CounterMappingRegion::MCDCBranchRegion) { - MCDCBranches.push_back(&Region); - - // As we move through all of the MCDCBranchRegions that follow the - // MCDCDecisionRegion, decrement NumConds to make sure we account for - // them all before we calculate the bitmap of executed test vectors. - if (--NumConds == 0) { - // Evaluating the test vector bitmap for the decision region entails - // calculating precisely what bits are pertinent to this region alone. - // This is calculated based on the recorded offset into the global - // profile bitmap; the length is calculated based on the recorded - // number of conditions. - Expected ExecutedTestVectorBitmap = - Ctx.evaluateBitmap(MCDCDecision); - if (auto E = ExecutedTestVectorBitmap.takeError()) { - consumeError(std::move(E)); - return Error::success(); - } + // Record ExpansionRegion. + if (Region.Kind == CounterMappingRegion::ExpansionRegion) { + MCDCDecisions.recordExpansion(Region); + continue; + } - // Since the bitmap identifies the executed test vectors for an MC/DC - // DecisionRegion, all of the information is now available to process. - // This is where the bulk of the MC/DC progressing takes place. - Expected Record = Ctx.evaluateMCDCRegion( - *MCDCDecision, *ExecutedTestVectorBitmap, MCDCBranches); - if (auto E = Record.takeError()) { - consumeError(std::move(E)); - return Error::success(); - } + // Do nothing unless MCDCBranchRegion. + if (Region.Kind != CounterMappingRegion::MCDCBranchRegion) + continue; - // Save the MC/DC Record so that it can be visualized later. - Function.pushMCDCRecord(*Record); - MCDCBranches.clear(); - } + auto Result = MCDCDecisions.processBranch(Region); + if (!Result) // Any Decision doesn't complete. + continue; + + auto MCDCDecision = Result->first; + auto &MCDCBranches = Result->second; + + // Evaluating the test vector bitmap for the decision region entails + // calculating precisely what bits are pertinent to this region alone. 
+ // This is calculated based on the recorded offset into the global + // profile bitmap; the length is calculated based on the recorded + // number of conditions. + Expected ExecutedTestVectorBitmap = + Ctx.evaluateBitmap(MCDCDecision); + if (auto E = ExecutedTestVectorBitmap.takeError()) { + consumeError(std::move(E)); + return Error::success(); } + + // Since the bitmap identifies the executed test vectors for an MC/DC + // DecisionRegion, all of the information is now available to process. + // This is where the bulk of the MC/DC progressing takes place. + Expected Record = Ctx.evaluateMCDCRegion( + *MCDCDecision, *ExecutedTestVectorBitmap, MCDCBranches); + if (auto E = Record.takeError()) { + consumeError(std::move(E)); + return Error::success(); + } + + // Save the MC/DC Record so that it can be visualized later. + Function.pushMCDCRecord(*Record); } // Don't create records for (filenames, function) pairs we've already seen. diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp index 1c7d8a8909c488..27727f216b0513 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp @@ -167,7 +167,15 @@ void CoverageMappingWriter::write(raw_ostream &OS) { return LHS.FileID < RHS.FileID; if (LHS.startLoc() != RHS.startLoc()) return LHS.startLoc() < RHS.startLoc(); - return LHS.Kind < RHS.Kind; + + // Put `Decision` before `Expansion`. + auto getKindKey = [](CounterMappingRegion::RegionKind Kind) { + return (Kind == CounterMappingRegion::MCDCDecisionRegion + ? 2 * CounterMappingRegion::ExpansionRegion - 1 + : 2 * Kind); + }; + + return getKindKey(LHS.Kind) < getKindKey(RHS.Kind); }); // Write out the fileid -> filename mapping. 
diff --git a/llvm/lib/Support/FormattedStream.cpp b/llvm/lib/Support/FormattedStream.cpp index c0d28435099570..c50530e76efc0a 100644 --- a/llvm/lib/Support/FormattedStream.cpp +++ b/llvm/lib/Support/FormattedStream.cpp @@ -94,6 +94,9 @@ void formatted_raw_ostream::UpdatePosition(const char *Ptr, size_t Size) { /// ComputePosition - Examine the current output and update line and column /// counts. void formatted_raw_ostream::ComputePosition(const char *Ptr, size_t Size) { + if (DisableScan) + return; + // If our previous scan pointer is inside the buffer, assume we already // scanned those bytes. This depends on raw_ostream to not change our buffer // in unexpected ways. diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp index 3c02492e99f1db..db2e4ca92ae40a 100644 --- a/llvm/lib/Support/RISCVISAInfo.cpp +++ b/llvm/lib/Support/RISCVISAInfo.cpp @@ -128,6 +128,7 @@ static const RISCVSupportedExtension SupportedExtensions[] = { {"zicclsm", {1, 0}}, {"ziccrse", {1, 0}}, {"zicntr", {2, 0}}, + {"zicond", {1, 0}}, {"zicsr", {2, 0}}, {"zifencei", {2, 0}}, {"zihintntl", {1, 0}}, @@ -200,8 +201,6 @@ static const RISCVSupportedExtension SupportedExperimentalExtensions[] = { {"zicfilp", {0, 4}}, {"zicfiss", {0, 4}}, - {"zicond", {1, 0}}, - {"zimop", {0, 1}}, {"ztso", {0, 1}}, diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 36700f73df4b20..feabd137c0cf1d 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -837,6 +837,7 @@ include "AArch64SchedA64FX.td" include "AArch64SchedThunderX3T110.td" include "AArch64SchedTSV110.td" include "AArch64SchedAmpere1.td" +include "AArch64SchedAmpere1B.td" include "AArch64SchedNeoverseN1.td" include "AArch64SchedNeoverseN2.td" include "AArch64SchedNeoverseV1.td" @@ -1376,6 +1377,24 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A", FeatureLdpAlignedOnly, FeatureStpAlignedOnly]>; +def TuneAmpere1B : 
SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B", + "Ampere Computing Ampere-1B processors", [ + FeaturePostRAScheduler, + FeatureFuseAES, + FeatureFuseAdrpAdd, + FeatureAddrLSLFast, + FeatureALULSLFast, + FeatureAggressiveFMA, + FeatureArithmeticBccFusion, + FeatureCmpBccFusion, + FeatureFuseAddress, + FeatureFuseLiterals, + FeatureStorePairSuppress, + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive, + FeatureLdpAlignedOnly, + FeatureStpAlignedOnly]>; + def ProcessorFeatures { list A53 = [HasV8_0aOps, FeatureCRC, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeaturePerfMon]; @@ -1529,6 +1548,11 @@ def ProcessorFeatures { FeatureMTE, FeatureSSBS, FeatureRandGen, FeatureSB, FeatureSM4, FeatureSHA2, FeatureSHA3, FeatureAES]; + list Ampere1B = [HasV8_7aOps, FeatureNEON, FeaturePerfMon, + FeatureMTE, FeatureSSBS, FeatureRandGen, + FeatureSB, FeatureSM4, FeatureSHA2, + FeatureSHA3, FeatureAES, FeatureCSSC, + FeatureWFxT, FeatureFullFP16]; // ETE and TRBE are future architecture extensions. We temporarily enable them // by default for users targeting generic AArch64. 
The extensions do not @@ -1696,6 +1720,9 @@ def : ProcessorModel<"ampere1", Ampere1Model, ProcessorFeatures.Ampere1, def : ProcessorModel<"ampere1a", Ampere1Model, ProcessorFeatures.Ampere1A, [TuneAmpere1A]>; +def : ProcessorModel<"ampere1b", Ampere1BModel, ProcessorFeatures.Ampere1B, + [TuneAmpere1B]>; + //===----------------------------------------------------------------------===// // Assembly parser //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp index 11248bb7aef31f..55c5bbc66a3f4f 100644 --- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp @@ -24,11 +24,13 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/InitializePasses.h" +#include "llvm/Object/COFF.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/TargetParser/Triple.h" using namespace llvm; +using namespace llvm::object; using OperandBundleDef = OperandBundleDefT; @@ -43,6 +45,8 @@ static cl::opt GenerateThunks("arm64ec-generate-thunks", cl::Hidden, namespace { +enum class ThunkType { GuestExit, Entry, Exit }; + class AArch64Arm64ECCallLowering : public ModulePass { public: static char ID; @@ -69,14 +73,14 @@ class AArch64Arm64ECCallLowering : public ModulePass { Type *I64Ty; Type *VoidTy; - void getThunkType(FunctionType *FT, AttributeList AttrList, bool EntryThunk, + void getThunkType(FunctionType *FT, AttributeList AttrList, ThunkType TT, raw_ostream &Out, FunctionType *&Arm64Ty, FunctionType *&X64Ty); void getThunkRetType(FunctionType *FT, AttributeList AttrList, raw_ostream &Out, Type *&Arm64RetTy, Type *&X64RetTy, SmallVectorImpl &Arm64ArgTypes, SmallVectorImpl &X64ArgTypes, bool &HasSretPtr); - void getThunkArgTypes(FunctionType *FT, AttributeList AttrList, + void getThunkArgTypes(FunctionType *FT, AttributeList 
AttrList, ThunkType TT, raw_ostream &Out, SmallVectorImpl &Arm64ArgTypes, SmallVectorImpl &X64ArgTypes, bool HasSretPtr); @@ -89,10 +93,11 @@ class AArch64Arm64ECCallLowering : public ModulePass { void AArch64Arm64ECCallLowering::getThunkType(FunctionType *FT, AttributeList AttrList, - bool EntryThunk, raw_ostream &Out, + ThunkType TT, raw_ostream &Out, FunctionType *&Arm64Ty, FunctionType *&X64Ty) { - Out << (EntryThunk ? "$ientry_thunk$cdecl$" : "$iexit_thunk$cdecl$"); + Out << (TT == ThunkType::Entry ? "$ientry_thunk$cdecl$" + : "$iexit_thunk$cdecl$"); Type *Arm64RetTy; Type *X64RetTy; @@ -102,8 +107,8 @@ void AArch64Arm64ECCallLowering::getThunkType(FunctionType *FT, // The first argument to a thunk is the called function, stored in x9. // For exit thunks, we pass the called function down to the emulator; - // for entry thunks, we just call the Arm64 function directly. - if (!EntryThunk) + // for entry/guest exit thunks, we just call the Arm64 function directly. + if (TT == ThunkType::Exit) Arm64ArgTypes.push_back(PtrTy); X64ArgTypes.push_back(PtrTy); @@ -111,14 +116,16 @@ void AArch64Arm64ECCallLowering::getThunkType(FunctionType *FT, getThunkRetType(FT, AttrList, Out, Arm64RetTy, X64RetTy, Arm64ArgTypes, X64ArgTypes, HasSretPtr); - getThunkArgTypes(FT, AttrList, Out, Arm64ArgTypes, X64ArgTypes, HasSretPtr); + getThunkArgTypes(FT, AttrList, TT, Out, Arm64ArgTypes, X64ArgTypes, + HasSretPtr); Arm64Ty = FunctionType::get(Arm64RetTy, Arm64ArgTypes, false); + X64Ty = FunctionType::get(X64RetTy, X64ArgTypes, false); } void AArch64Arm64ECCallLowering::getThunkArgTypes( - FunctionType *FT, AttributeList AttrList, raw_ostream &Out, + FunctionType *FT, AttributeList AttrList, ThunkType TT, raw_ostream &Out, SmallVectorImpl &Arm64ArgTypes, SmallVectorImpl &X64ArgTypes, bool HasSretPtr) { @@ -156,9 +163,11 @@ void AArch64Arm64ECCallLowering::getThunkArgTypes( X64ArgTypes.push_back(PtrTy); // x5 Arm64ArgTypes.push_back(I64Ty); - // FIXME: x5 isn't actually passed/used by 
the x64 side; revisit once we - // have proper isel for varargs - X64ArgTypes.push_back(I64Ty); + if (TT != ThunkType::Entry) { + // FIXME: x5 isn't actually used by the x64 side; revisit once we + // have proper isel for varargs + X64ArgTypes.push_back(I64Ty); + } return; } @@ -339,8 +348,7 @@ Function *AArch64Arm64ECCallLowering::buildExitThunk(FunctionType *FT, SmallString<256> ExitThunkName; llvm::raw_svector_ostream ExitThunkStream(ExitThunkName); FunctionType *Arm64Ty, *X64Ty; - getThunkType(FT, Attrs, /*EntryThunk*/ false, ExitThunkStream, Arm64Ty, - X64Ty); + getThunkType(FT, Attrs, ThunkType::Exit, ExitThunkStream, Arm64Ty, X64Ty); if (Function *F = M->getFunction(ExitThunkName)) return F; @@ -443,7 +451,7 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) { SmallString<256> EntryThunkName; llvm::raw_svector_ostream EntryThunkStream(EntryThunkName); FunctionType *Arm64Ty, *X64Ty; - getThunkType(F->getFunctionType(), F->getAttributes(), /*EntryThunk*/ true, + getThunkType(F->getFunctionType(), F->getAttributes(), ThunkType::Entry, EntryThunkStream, Arm64Ty, X64Ty); if (Function *F = M->getFunction(EntryThunkName)) return F; @@ -465,10 +473,11 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) { bool TransformDirectToSRet = X64RetType->isVoidTy() && !RetTy->isVoidTy(); unsigned ThunkArgOffset = TransformDirectToSRet ? 2 : 1; + unsigned PassthroughArgSize = F->isVarArg() ? 5 : Thunk->arg_size(); // Translate arguments to call. 
SmallVector Args; - for (unsigned i = ThunkArgOffset, e = Thunk->arg_size(); i != e; ++i) { + for (unsigned i = ThunkArgOffset, e = PassthroughArgSize; i != e; ++i) { Value *Arg = Thunk->getArg(i); Type *ArgTy = Arm64Ty->getParamType(i - ThunkArgOffset); if (ArgTy->isArrayTy() || ArgTy->isStructTy() || @@ -485,6 +494,22 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) { Args.push_back(Arg); } + if (F->isVarArg()) { + // The 5th argument to variadic entry thunks is used to model the x64 sp + // which is passed to the thunk in x4, this can be passed to the callee as + // the variadic argument start address after skipping over the 32 byte + // shadow store. + + // The EC thunk CC will assign any argument marked as InReg to x4. + Thunk->addParamAttr(5, Attribute::InReg); + Value *Arg = Thunk->getArg(5); + Arg = IRB.CreatePtrAdd(Arg, IRB.getInt64(0x20)); + Args.push_back(Arg); + + // Pass in a zero variadic argument size (in x5). + Args.push_back(IRB.getInt64(0)); + } + // Call the function passed to the thunk. 
Value *Callee = Thunk->getArg(0); Callee = IRB.CreateBitCast(Callee, PtrTy); @@ -518,7 +543,7 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) { Function *AArch64Arm64ECCallLowering::buildGuestExitThunk(Function *F) { llvm::raw_null_ostream NullThunkName; FunctionType *Arm64Ty, *X64Ty; - getThunkType(F->getFunctionType(), F->getAttributes(), /*EntryThunk*/ true, + getThunkType(F->getFunctionType(), F->getAttributes(), ThunkType::GuestExit, NullThunkName, Arm64Ty, X64Ty); auto MangledName = getArm64ECMangledFunctionName(F->getName().str()); assert(MangledName && "Can't guest exit to function that's already native"); diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 78ea4a5180f703..8e67f0f5c8815f 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -213,6 +213,9 @@ def CC_AArch64_Arm64EC_VarArg : CallingConv<[ // address is passed in X9. let Entry = 1 in def CC_AArch64_Arm64EC_Thunk : CallingConv<[ + // ARM64EC-specific: the InReg attribute can be used to access the x64 sp passed into entry thunks in x4 from the IR. + CCIfInReg>>, + // Byval aggregates are passed by pointer CCIfByVal>, diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 352c61d48e2fff..1af064b6de3cba 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1544,6 +1544,12 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated. 
return true; } + case AArch64::COALESCER_BARRIER_FPR16: + case AArch64::COALESCER_BARRIER_FPR32: + case AArch64::COALESCER_BARRIER_FPR64: + case AArch64::COALESCER_BARRIER_FPR128: + MI.eraseFromParent(); + return true; case AArch64::LD1B_2Z_IMM_PSEUDO: return expandMultiVecPseudo( MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass, diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index d55deec9760092..732e787d2a321d 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -4339,8 +4339,10 @@ AArch64FrameLowering::inlineStackProbeLoopExactMultiple( ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); MBB.addSuccessor(LoopMBB); // Update liveins. - recomputeLiveIns(*LoopMBB); - recomputeLiveIns(*ExitMBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*LoopMBB); + } while (anyChange); return ExitMBB->begin(); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 332fb37655288c..95d8ab95b2c097 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1658,40 +1658,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setMaxAtomicSizeInBitsSupported(128); if (Subtarget->isWindowsArm64EC()) { - // FIXME: are there other intrinsics we need to add here? 
- setLibcallName(RTLIB::MEMCPY, "#memcpy"); - setLibcallName(RTLIB::MEMSET, "#memset"); - setLibcallName(RTLIB::MEMMOVE, "#memmove"); - setLibcallName(RTLIB::REM_F32, "#fmodf"); - setLibcallName(RTLIB::REM_F64, "#fmod"); - setLibcallName(RTLIB::FMA_F32, "#fmaf"); - setLibcallName(RTLIB::FMA_F64, "#fma"); - setLibcallName(RTLIB::SQRT_F32, "#sqrtf"); - setLibcallName(RTLIB::SQRT_F64, "#sqrt"); - setLibcallName(RTLIB::CBRT_F32, "#cbrtf"); - setLibcallName(RTLIB::CBRT_F64, "#cbrt"); - setLibcallName(RTLIB::LOG_F32, "#logf"); - setLibcallName(RTLIB::LOG_F64, "#log"); - setLibcallName(RTLIB::LOG2_F32, "#log2f"); - setLibcallName(RTLIB::LOG2_F64, "#log2"); - setLibcallName(RTLIB::LOG10_F32, "#log10f"); - setLibcallName(RTLIB::LOG10_F64, "#log10"); - setLibcallName(RTLIB::EXP_F32, "#expf"); - setLibcallName(RTLIB::EXP_F64, "#exp"); - setLibcallName(RTLIB::EXP2_F32, "#exp2f"); - setLibcallName(RTLIB::EXP2_F64, "#exp2"); - setLibcallName(RTLIB::EXP10_F32, "#exp10f"); - setLibcallName(RTLIB::EXP10_F64, "#exp10"); - setLibcallName(RTLIB::SIN_F32, "#sinf"); - setLibcallName(RTLIB::SIN_F64, "#sin"); - setLibcallName(RTLIB::COS_F32, "#cosf"); - setLibcallName(RTLIB::COS_F64, "#cos"); - setLibcallName(RTLIB::POW_F32, "#powf"); - setLibcallName(RTLIB::POW_F64, "#pow"); - setLibcallName(RTLIB::LDEXP_F32, "#ldexpf"); - setLibcallName(RTLIB::LDEXP_F64, "#ldexp"); - setLibcallName(RTLIB::FREXP_F32, "#frexpf"); - setLibcallName(RTLIB::FREXP_F64, "#frexp"); + // FIXME: are there intrinsics we need to exclude from this? 
+ for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) { + auto code = static_cast(i); + auto libcallName = getLibcallName(code); + if ((libcallName != nullptr) && (libcallName[0] != '#')) { + setLibcallName(code, Saver.save(Twine("#") + libcallName).data()); + } + } } } @@ -2375,6 +2349,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((AArch64ISD::NodeType)Opcode) { case AArch64ISD::FIRST_NUMBER: break; + MAKE_CASE(AArch64ISD::COALESCER_BARRIER) MAKE_CASE(AArch64ISD::SMSTART) MAKE_CASE(AArch64ISD::SMSTOP) MAKE_CASE(AArch64ISD::RESTORE_ZA) @@ -7154,13 +7129,18 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, } } +static bool isPassedInFPR(EVT VT) { + return VT.isFixedLengthVector() || + (VT.isFloatingPoint() && !VT.isScalableVector()); +} + /// LowerCallResult - Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. SDValue AArch64TargetLowering::LowerCallResult( SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &RVLocs, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, - SDValue ThisVal) const { + SDValue ThisVal, bool RequiresSMChange) const { DenseMap CopiedRegs; // Copy all of the result registers out of their specified physreg. for (unsigned i = 0; i != RVLocs.size(); ++i) { @@ -7205,6 +7185,10 @@ SDValue AArch64TargetLowering::LowerCallResult( break; } + if (RequiresSMChange && isPassedInFPR(VA.getValVT())) + Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, Val.getValueType(), + Val); + InVals.push_back(Val); } @@ -7915,6 +7899,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, return ArgReg.Reg == VA.getLocReg(); }); } else { + // Add an extra level of indirection for streaming mode changes by + // using a pseudo copy node that cannot be rematerialised between a + // smstart/smstop and the call by the simple register coalescer. 
+ if (RequiresSMChange && isPassedInFPR(Arg.getValueType())) + Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, + Arg.getValueType(), Arg); RegsToPass.emplace_back(VA.getLocReg(), Arg); RegsUsed.insert(VA.getLocReg()); const TargetOptions &Options = DAG.getTarget().Options; @@ -7991,11 +7981,19 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } if (IsVarArg && Subtarget->isWindowsArm64EC()) { + SDValue ParamPtr = StackPtr; + if (IsTailCall) { + // Create a dummy object at the top of the stack that can be used to get + // the SP after the epilogue + int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true); + ParamPtr = DAG.getFrameIndex(FI, PtrVT); + } + // For vararg calls, the Arm64EC ABI requires values in x4 and x5 // describing the argument list. x4 contains the address of the // first stack parameter. x5 contains the size in bytes of all parameters // passed on the stack. - RegsToPass.emplace_back(AArch64::X4, StackPtr); + RegsToPass.emplace_back(AArch64::X4, ParamPtr); RegsToPass.emplace_back(AArch64::X5, DAG.getConstant(NumBytes, DL, MVT::i64)); } @@ -8151,9 +8149,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Handle result values, copying them out of physregs into vregs that we // return. - SDValue Result = LowerCallResult(Chain, InGlue, CallConv, IsVarArg, RVLocs, - DL, DAG, InVals, IsThisReturn, - IsThisReturn ? OutVals[0] : SDValue()); + SDValue Result = LowerCallResult( + Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn, + IsThisReturn ? 
OutVals[0] : SDValue(), RequiresSMChange); if (!Ins.empty()) InGlue = Result.getValue(Result->getNumValues() - 1); @@ -10702,6 +10700,14 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( parseConstraintCode(Constraint) != AArch64CC::Invalid) return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); + if (Constraint == "{za}") { + return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass); + } + + if (Constraint == "{zt0}") { + return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass); + } + // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair Res; @@ -24403,7 +24409,8 @@ void AArch64TargetLowering::ReplaceBITCASTResults( return; } - if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1) + if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 && + !VT.isVector()) return replaceBoolVectorBitcast(N, Results, DAG); if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16)) @@ -26899,7 +26906,7 @@ bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported( return false; // If the vector is scalable, SVE is enabled, implying support for complex - // numbers. Otherwirse, we need to ensure complex number support is avaialble + // numbers. 
Otherwise, we need to ensure complex number support is available if (!VTy->isScalableTy() && !Subtarget->hasComplxNum()) return false; @@ -26915,7 +26922,7 @@ bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported( !llvm::isPowerOf2_32(VTyWidth)) return false; - if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2()) { + if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) { unsigned ScalarWidth = ScalarTy->getScalarSizeInBits(); return 8 <= ScalarWidth && ScalarWidth <= 64; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 6505931e17e18d..74d0c4bde8dd2e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -58,6 +58,8 @@ enum NodeType : unsigned { CALL_BTI, // Function call followed by a BTI instruction. + COALESCER_BARRIER, + SMSTART, SMSTOP, RESTORE_ZA, @@ -999,6 +1001,9 @@ class AArch64TargetLowering : public TargetLowering { /// make the right decision when generating code for different targets. 
const AArch64Subtarget *Subtarget; + llvm::BumpPtrAllocator BumpAlloc; + llvm::StringSaver Saver{BumpAlloc}; + bool isExtFreeImpl(const Instruction *Ext) const override; void addTypeForNEON(MVT VT); @@ -1026,7 +1031,7 @@ class AArch64TargetLowering : public TargetLowering { const SmallVectorImpl &RVLocs, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, - SDValue ThisVal) const; + SDValue ThisVal, bool RequiresSMChange) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 2e8d8c63d6bec2..9b4bb7c88bc821 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -4098,16 +4098,6 @@ AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) { return MI.getOperand(Idx); } -const MachineOperand & -AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: - llvm_unreachable("Unexpected opcode"); - case AArch64::LDRBBroX: - return MI.getOperand(4); - } -} - static const TargetRegisterClass *getRegClass(const MachineInstr &MI, Register Reg) { if (MI.getParent() == nullptr) @@ -9597,9 +9587,13 @@ AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI, // Update liveins. 
if (MF.getRegInfo().reservedRegsFrozen()) { - recomputeLiveIns(*LoopTestMBB); - recomputeLiveIns(*LoopBodyMBB); - recomputeLiveIns(*ExitMBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*ExitMBB) || + recomputeLiveIns(*LoopBodyMBB) || + recomputeLiveIns(*LoopTestMBB); + } while (anyChange); + ; } return ExitMBB->begin(); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index db24a19fe5f8e3..6526f6740747ab 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -111,9 +111,6 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo { /// Returns the immediate offset operator of a load/store. static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI); - /// Returns the shift amount operator of a load/store. - static const MachineOperand &getLdStAmountOp(const MachineInstr &MI); - /// Returns whether the instruction is FP or NEON. static bool isFpOrNEON(const MachineInstr &MI); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 03baa7497615e3..ac61dd8745d4e6 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4885,19 +4885,9 @@ defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)), (zext (v8i8 V64:$opB))))), (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>; -def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), - (v8i16 (add (sub (zext (v8i8 V64:$opA)), - (zext (v8i8 V64:$opB))), - (AArch64vashr v8i16:$src, (i32 15))))), - (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>; def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 (v16i8 V128:$opA))), (zext (extract_high_v16i8 (v16i8 V128:$opB)))))), (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; -def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), - (v8i16 (add (sub (zext (extract_high_v16i8 (v16i8 V128:$opA))), - (zext 
(extract_high_v16i8 (v16i8 V128:$opB)))), - (AArch64vashr v8i16:$src, (i32 15))))), - (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; def : Pat<(abs (v4i32 (sub (zext (v4i16 V64:$opA)), (zext (v4i16 V64:$opB))))), (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>; diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index e90b8a8ca7acee..926a89466255ca 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -62,8 +62,6 @@ STATISTIC(NumUnscaledPairCreated, "Number of load/store from unscaled generated"); STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted"); STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted"); -STATISTIC(NumConstOffsetFolded, - "Number of const offset of index address folded"); DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming", "Controls which pairs are considered for renaming"); @@ -77,11 +75,6 @@ static cl::opt LdStLimit("aarch64-load-store-scan-limit", static cl::opt UpdateLimit("aarch64-update-scan-limit", cl::init(100), cl::Hidden); -// The LdStConstLimit limits how far we search for const offset instructions -// when we form index address load/store instructions. -static cl::opt LdStConstLimit("aarch64-load-store-const-scan-limit", - cl::init(10), cl::Hidden); - // Enable register renaming to find additional store pairing opportunities. static cl::opt EnableRenaming("aarch64-load-store-renaming", cl::init(true), cl::Hidden); @@ -178,13 +171,6 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit); - // Scan the instruction list to find a register assigned with a const - // value that can be combined with the current instruction (a load or store) - // using base addressing with writeback. Scan forwards. 
- MachineBasicBlock::iterator - findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit, - unsigned &Offset); - // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using // pre or post indexed addressing with writeback. Scan backwards. @@ -196,19 +182,11 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI, unsigned BaseReg, int Offset); - bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI, - unsigned IndexReg, unsigned &Offset); - // Merge a pre- or post-index base register update into a ld/st instruction. MachineBasicBlock::iterator mergeUpdateInsn(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update, bool IsPreIdx); - MachineBasicBlock::iterator - mergeConstOffsetInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update, unsigned Offset, - int Scale); - // Find and merge zero store instructions. bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI); @@ -221,9 +199,6 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // Find and merge a base register updates before or after a ld/st instruction. bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI); - // Find and merge a index ldr/st instructions into a base ld/st instruction. - bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale); - bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt); bool runOnMachineFunction(MachineFunction &Fn) override; @@ -506,16 +481,6 @@ static unsigned getPreIndexedOpcode(unsigned Opc) { } } -static unsigned getBaseAddressOpcode(unsigned Opc) { - // TODO: Add more index address loads/stores. 
- switch (Opc) { - default: - llvm_unreachable("Opcode has no base address equivalent!"); - case AArch64::LDRBBroX: - return AArch64::LDRBBui; - } -} - static unsigned getPostIndexedOpcode(unsigned Opc) { switch (Opc) { default: @@ -757,20 +722,6 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) { } } -// Make sure this is a reg+reg Ld/St -static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) { - unsigned Opc = MI.getOpcode(); - switch (Opc) { - default: - return false; - // Scaled instructions. - // TODO: Add more index address loads/stores. - case AArch64::LDRBBroX: - Scale = 1; - return true; - } -} - static bool isRewritableImplicitDef(unsigned Opc) { switch (Opc) { default: @@ -2097,63 +2048,6 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, return NextI; } -MachineBasicBlock::iterator -AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update, - unsigned Offset, int Scale) { - assert((Update->getOpcode() == AArch64::MOVKWi) && - "Unexpected const mov instruction to merge!"); - MachineBasicBlock::iterator E = I->getParent()->end(); - MachineBasicBlock::iterator NextI = next_nodbg(I, E); - MachineBasicBlock::iterator PrevI = prev_nodbg(Update, E); - MachineInstr &MemMI = *I; - unsigned Mask = (1 << 12) * Scale - 1; - unsigned Low = Offset & Mask; - unsigned High = Offset - Low; - Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg(); - Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg(); - MachineInstrBuilder AddMIB, MemMIB; - - // Add IndexReg, BaseReg, High (the BaseReg may be SP) - AddMIB = - BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(AArch64::ADDXri)) - .addDef(IndexReg) - .addUse(BaseReg) - .addImm(High >> 12) // shifted value - .addImm(12); // shift 12 - (void)AddMIB; - // Ld/St DestReg, IndexReg, Imm12 - unsigned NewOpc = getBaseAddressOpcode(I->getOpcode()); - MemMIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), 
TII->get(NewOpc)) - .add(getLdStRegOp(MemMI)) - .add(AArch64InstrInfo::getLdStOffsetOp(MemMI)) - .addImm(Low / Scale) - .setMemRefs(I->memoperands()) - .setMIFlags(I->mergeFlagsWith(*Update)); - (void)MemMIB; - - ++NumConstOffsetFolded; - LLVM_DEBUG(dbgs() << "Creating base address load/store.\n"); - LLVM_DEBUG(dbgs() << " Replacing instructions:\n "); - LLVM_DEBUG(PrevI->print(dbgs())); - LLVM_DEBUG(dbgs() << " "); - LLVM_DEBUG(Update->print(dbgs())); - LLVM_DEBUG(dbgs() << " "); - LLVM_DEBUG(I->print(dbgs())); - LLVM_DEBUG(dbgs() << " with instruction:\n "); - LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs())); - LLVM_DEBUG(dbgs() << " "); - LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs())); - LLVM_DEBUG(dbgs() << "\n"); - - // Erase the old instructions for the block. - I->eraseFromParent(); - PrevI->eraseFromParent(); - Update->eraseFromParent(); - - return NextI; -} - bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI, unsigned BaseReg, int Offset) { @@ -2201,31 +2095,6 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, return false; } -bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI, - MachineInstr &MI, - unsigned IndexReg, - unsigned &Offset) { - // The update instruction source and destination register must be the - // same as the load/store index register. - if (MI.getOpcode() == AArch64::MOVKWi && - TRI->isSuperOrSubRegisterEq(IndexReg, MI.getOperand(1).getReg())) { - - // movz + movk hold a large offset of a Ld/St instruction. - MachineBasicBlock::iterator B = MI.getParent()->begin(); - MachineBasicBlock::iterator MBBI = &MI; - MBBI = prev_nodbg(MBBI, B); - MachineInstr &MovzMI = *MBBI; - if (MovzMI.getOpcode() == AArch64::MOVZWi) { - unsigned Low = MovzMI.getOperand(1).getImm(); - unsigned High = MI.getOperand(2).getImm() << MI.getOperand(3).getImm(); - Offset = High + Low; - // 12-bit optionally shifted immediates are legal for adds. 
- return Offset >> 24 == 0; - } - } - return false; -} - MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) { MachineBasicBlock::iterator E = I->getParent()->end(); @@ -2381,60 +2250,6 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( return E; } -MachineBasicBlock::iterator -AArch64LoadStoreOpt::findMatchingConstOffsetBackward( - MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) { - MachineBasicBlock::iterator B = I->getParent()->begin(); - MachineBasicBlock::iterator E = I->getParent()->end(); - MachineInstr &MemMI = *I; - MachineBasicBlock::iterator MBBI = I; - - // If the load is the first instruction in the block, there's obviously - // not any matching load or store. - if (MBBI == B) - return E; - - // Make sure the IndexReg is killed and the shift amount is zero. - // TODO: Relex this restriction to extend, simplify processing now. - if (!AArch64InstrInfo::getLdStOffsetOp(MemMI).isKill() || - !AArch64InstrInfo::getLdStAmountOp(MemMI).isImm() || - (AArch64InstrInfo::getLdStAmountOp(MemMI).getImm() != 0)) - return E; - - Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg(); - - // Track which register units have been modified and used between the first - // insn (inclusive) and the second insn. - ModifiedRegUnits.clear(); - UsedRegUnits.clear(); - unsigned Count = 0; - do { - MBBI = prev_nodbg(MBBI, B); - MachineInstr &MI = *MBBI; - - // Don't count transient instructions towards the search limit since there - // may be different numbers of them if e.g. debug information is present. - if (!MI.isTransient()) - ++Count; - - // If we found a match, return it. - if (isMatchingMovConstInsn(*I, MI, IndexReg, Offset)) { - return MBBI; - } - - // Update the status of what the instruction clobbered and used. 
- LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI); - - // Otherwise, if the index register is used or modified, we have no match, - // so return early. - if (!ModifiedRegUnits.available(IndexReg) || - !UsedRegUnits.available(IndexReg)) - return E; - - } while (MBBI != B && Count < Limit); - return E; -} - bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; @@ -2619,34 +2434,6 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate return false; } -bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, - int Scale) { - MachineInstr &MI = *MBBI; - MachineBasicBlock::iterator E = MI.getParent()->end(); - MachineBasicBlock::iterator Update; - - // Don't know how to handle unscaled pre/post-index versions below, so bail. - if (TII->hasUnscaledLdStOffset(MI.getOpcode())) - return false; - - // Look back to try to find a const offset for index LdSt instruction. For - // example, - // mov x8, #LargeImm ; = a * (1<<12) + imm12 - // ldr x1, [x0, x8] - // merged into: - // add x8, x0, a * (1<<12) - // ldr x1, [x8, imm12] - unsigned Offset; - Update = findMatchingConstOffsetBackward(MBBI, LdStConstLimit, Offset); - if (Update != E && (Offset & (Scale - 1)) == 0) { - // Merge the imm12 into the ld/st. - MBBI = mergeConstOffsetInsn(MBBI, Update, Offset, Scale); - return true; - } - - return false; -} - bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt) { @@ -2725,22 +2512,6 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, ++MBBI; } - // 5) Find a register assigned with a const value that can be combined with - // into the load or store. 
e.g., - // mov x8, #LargeImm ; = a * (1<<12) + imm12 - // ldr x1, [x0, x8] - // ; becomes - // add x8, x0, a * (1<<12) - // ldr x1, [x8, imm12] - for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - MBBI != E;) { - int Scale; - if (isMergeableIndexLdSt(*MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale)) - Modified = true; - else - ++MBBI; - } - return Modified; } diff --git a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp index 1e12cf545fa777..37d621cd2f6580 100644 --- a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -23,11 +23,13 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/Object/COFF.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; +using namespace llvm::object; extern cl::opt EnableAArch64ELFLocalDynamicTLSGeneration; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index ea9882160d6fb2..48e1c1bc73022c 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -507,6 +507,10 @@ bool AArch64RegisterInfo::isAsmClobberable(const MachineFunction &MF, MCRegisterInfo::regsOverlap(PhysReg, AArch64::X16)) return true; + // ZA/ZT0 registers are reserved but may be permitted in the clobber list. 
+ if (PhysReg == AArch64::ZA || PhysReg == AArch64::ZT0) + return true; + return !isReservedReg(MF, PhysReg); } @@ -1015,6 +1019,8 @@ bool AArch64RegisterInfo::shouldCoalesce( MachineInstr *MI, const TargetRegisterClass *SrcRC, unsigned SubReg, const TargetRegisterClass *DstRC, unsigned DstSubReg, const TargetRegisterClass *NewRC, LiveIntervals &LIS) const { + MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); + if (MI->isCopy() && ((DstRC->getID() == AArch64::GPR64RegClassID) || (DstRC->getID() == AArch64::GPR64commonRegClassID)) && @@ -1023,5 +1029,38 @@ bool AArch64RegisterInfo::shouldCoalesce( // which implements a 32 to 64 bit zero extension // which relies on the upper 32 bits being zeroed. return false; + + auto IsCoalescerBarrier = [](const MachineInstr &MI) { + switch (MI.getOpcode()) { + case AArch64::COALESCER_BARRIER_FPR16: + case AArch64::COALESCER_BARRIER_FPR32: + case AArch64::COALESCER_BARRIER_FPR64: + case AArch64::COALESCER_BARRIER_FPR128: + return true; + default: + return false; + } + }; + + // For calls that temporarily have to toggle streaming mode as part of the + // call-sequence, we need to be more careful when coalescing copy instructions + // so that we don't end up coalescing the NEON/FP result or argument register + // with a whole Z-register, such that after coalescing the register allocator + // will try to spill/reload the entire Z register. + // + // We do this by checking if the node has any defs/uses that are + // COALESCER_BARRIER pseudos. These are 'nops' in practice, but they exist to + // instruct the coalescer to avoid coalescing the copy. 
+ if (MI->isCopy() && SubReg != DstSubReg && + (AArch64::ZPRRegClass.hasSubClassEq(DstRC) || + AArch64::ZPRRegClass.hasSubClassEq(SrcRC))) { + unsigned SrcReg = MI->getOperand(1).getReg(); + if (any_of(MRI.def_instructions(SrcReg), IsCoalescerBarrier)) + return false; + unsigned DstReg = MI->getOperand(0).getReg(); + if (any_of(MRI.use_nodbg_instructions(DstReg), IsCoalescerBarrier)) + return false; + } + return true; } diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index eeae5303a3f898..acf067f2cc5a9d 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -28,6 +28,8 @@ def AArch64_restore_zt : SDNode<"AArch64ISD::RESTORE_ZT", SDTypeProfile<0, 2, def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisPtrTy<1>]>, [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>; +def AArch64CoalescerBarrier + : SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, []>; //===----------------------------------------------------------------------===// // Instruction naming conventions. @@ -189,6 +191,26 @@ def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val), (MSR 0xde85, GPR64:$val)>; def : Pat<(i64 (int_aarch64_sme_get_tpidr2)), (MRS 0xde85)>; + +multiclass CoalescerBarrierPseudo vts> { + def NAME : Pseudo<(outs rc:$dst), (ins rc:$src), []>, Sched<[]> { + let Constraints = "$dst = $src"; + } + foreach vt = vts in { + def : Pat<(vt (AArch64CoalescerBarrier (vt rc:$src))), + (!cast(NAME) rc:$src)>; + } +} + +multiclass CoalescerBarriers { + defm _FPR16 : CoalescerBarrierPseudo; + defm _FPR32 : CoalescerBarrierPseudo; + defm _FPR64 : CoalescerBarrierPseudo; + defm _FPR128 : CoalescerBarrierPseudo; +} + +defm COALESCER_BARRIER : CoalescerBarriers; + } // End let Predicates = [HasSME] // Pseudo to match to smstart/smstop. 
This expands: diff --git a/llvm/lib/Target/AArch64/AArch64SchedA53.td b/llvm/lib/Target/AArch64/AArch64SchedA53.td index 3e4168f5f445f5..c714bad92b7fbb 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA53.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA53.td @@ -29,7 +29,7 @@ def CortexA53Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); } diff --git a/llvm/lib/Target/AArch64/AArch64SchedA57.td b/llvm/lib/Target/AArch64/AArch64SchedA57.td index 277ec772cf0f10..ebbc3b72b50609 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA57.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA57.td @@ -34,7 +34,7 @@ def CortexA57Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td index 7edce4b61605d2..d6fe84a2c9c9b4 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td @@ -22,7 +22,8 @@ def A64FXModel : SchedMachineModel { list UnsupportedFeatures = !listconcat(SMEUnsupported.F, SVEUnsupported.F, [HasMTE, HasMatMulInt8, HasBF16, - HasPAuth, HasPAuthLR, HasCPA]); + HasPAuth, HasPAuthLR, HasCPA, + HasCSSC]); let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td b/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td new file mode 100644 index 00000000000000..9c4f000cf351b2 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td @@ -0,0 +1,1149 @@ +//=- AArch64SchedAmpere1B.td - Ampere-1B scheduling def -----*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for the Ampere Computing Ampere-1B to +// support instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +// The Ampere-1B core is an out-of-order micro-architecture. The front +// end has branch prediction, with a 10-cycle recovery time from a +// mispredicted branch. Instructions coming out of the front end are +// decoded into internal micro-ops (uops). + +def Ampere1BModel : SchedMachineModel { + let IssueWidth = 12; // Maximum micro-ops dispatch rate. + let MicroOpBufferSize = 192; // micro-op re-order buffer size + let LoadLatency = 3; // Optimistic load latency + let MispredictPenalty = 10; // Branch mispredict penalty + let LoopMicroOpBufferSize = 32; // Instruction queue size + let CompleteModel = 1; + + list UnsupportedFeatures = !listconcat(SVEUnsupported.F, + SMEUnsupported.F, + PAUnsupported.F); +} + +let SchedModel = Ampere1BModel in { + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Ampere-1B. 
+ +def Ampere1BUnitA : ProcResource<2>; // integer single-cycle, branch, and flags r/w +def Ampere1BUnitB : ProcResource<2>; // integer single-cycle, and complex shifts +def Ampere1BUnitBS : ProcResource<1>; // integer multi-cycle +def Ampere1BUnitL : ProcResource<2>; // load +def Ampere1BUnitS : ProcResource<2>; // store address calculation +def Ampere1BUnitX : ProcResource<1>; // FP and vector operations, and flag write +def Ampere1BUnitY : ProcResource<1>; // FP and vector operations, and crypto +def Ampere1BUnitZ : ProcResource<1>; // FP store data and FP-to-integer moves + +def Ampere1BUnitAB : ProcResGroup<[Ampere1BUnitA, Ampere1BUnitB]>; +def Ampere1BUnitXY : ProcResGroup<[Ampere1BUnitX, Ampere1BUnitY]>; + +//===----------------------------------------------------------------------===// +// Define customized scheduler read/write types specific to the Ampere-1. + +def Ampere1BWrite_1cyc_1A : SchedWriteRes<[Ampere1BUnitA]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_2A : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitA]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1BWrite_1cyc_1B : SchedWriteRes<[Ampere1BUnitB]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_1BS_1B : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitB]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1BWrite_1cyc_1AB : SchedWriteRes<[Ampere1BUnitAB]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_1AB_1A : SchedWriteRes<[Ampere1BUnitAB, Ampere1BUnitA]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1BWrite_1cyc_1L : SchedWriteRes<[Ampere1BUnitL]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_1S : SchedWriteRes<[Ampere1BUnitS]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_2S : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS]> { 
+ let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1BWrite_2cyc_1Y : SchedWriteRes<[Ampere1BUnitY]> { + let Latency = 2; + let NumMicroOps = 1; +} + +def Ampere1BWrite_2cyc_2AB : SchedWriteRes<[Ampere1BUnitAB, Ampere1BUnitAB]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1BWrite_2cyc_1B_1AB : SchedWriteRes<[Ampere1BUnitB, Ampere1BUnitAB]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1BWrite_2cyc_1B_1S : SchedWriteRes<[Ampere1BUnitB, Ampere1BUnitS]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1BWrite_2cyc_1B_1S_1AB : SchedWriteRes<[Ampere1BUnitB, + Ampere1BUnitS, + Ampere1BUnitAB]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1BWrite_2cyc_1S_2Z : SchedWriteRes<[Ampere1BUnitS, + Ampere1BUnitZ, + Ampere1BUnitZ]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1BWrite_2cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]> { + let Latency = 2; + let NumMicroOps = 1; +} + +def Ampere1BWrite_2cyc_1S_1Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitZ]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1BWrite_3cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1BWrite_3cyc_1L : SchedWriteRes<[Ampere1BUnitL]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1BWrite_3cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1BWrite_3cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1BWrite_3cyc_1Z : SchedWriteRes<[Ampere1BUnitZ]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1BWrite_3cyc_1S_1Z : SchedWriteRes<[Ampere1BUnitS, + Ampere1BUnitZ]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def Ampere1BWrite_3cyc_1S_2Z : SchedWriteRes<[Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def Ampere1BWrite_3cyc_2S_2Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, 
Ampere1BUnitZ]> { + let Latency = 3; + let NumMicroOps = 4; +} + +def Ampere1BWrite_4cyc_1BS_1AB : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitAB]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1BWrite_4cyc_1L : SchedWriteRes<[Ampere1BUnitL]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1BWrite_4cyc_2L : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1BWrite_4cyc_1L_1B : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitB]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1BWrite_4cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1BWrite_4cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1BWrite_4cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1BWrite_5cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1BWrite_4cyc_1XY_1S_1Z : SchedWriteRes<[Ampere1BUnitXY, + Ampere1BUnitS, + Ampere1BUnitZ]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def Ampere1BWrite_4cyc_3S_3Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitS, Ampere1BUnitZ, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 4; + let NumMicroOps = 6; +} + +def Ampere1BWrite_5cyc_4S_4Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 5; + let NumMicroOps = 8; +} + +def Ampere1BWrite_5cyc_1L_1BS : SchedWriteRes<[Ampere1BUnitL, + Ampere1BUnitBS]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1BWrite_5cyc_3L : SchedWriteRes<[Ampere1BUnitL, + Ampere1BUnitL, + Ampere1BUnitL]> { + let Latency = 5; + let NumMicroOps = 3; +} + +def Ampere1BWrite_5cyc_4L : SchedWriteRes<[Ampere1BUnitL, + Ampere1BUnitL, + Ampere1BUnitL, + Ampere1BUnitL]> { + let Latency = 5; + let 
NumMicroOps = 4; +} + +def Ampere1BWrite_5cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1BWrite_5cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 5; + let NumMicroOps = 6; +} + +def Ampere1BWrite_6cyc_1BS_1A : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitA]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1BWrite_6cyc_1BS_2A : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitA, + Ampere1BUnitA]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1BWrite_6cyc_1L_1XY : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitXY]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1BWrite_6cyc_2L_2XY : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def Ampere1BWrite_6cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1BWrite_6cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1BWrite_6cyc_3XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1BWrite_6cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 6; + let NumMicroOps = 6; +} + +def Ampere1BWrite_6cyc_3XY_3S_3Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitS, Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 6; + let NumMicroOps = 9; +} + +def Ampere1BWrite_7cyc_1BS_1XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1BWrite_7cyc_1XY_1Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitZ]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def 
Ampere1BWrite_7cyc_1X_1Z : SchedWriteRes<[Ampere1BUnitX, Ampere1BUnitZ]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1BWrite_7cyc_3L_3XY : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitL, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 7; + let NumMicroOps = 6; +} + +def Ampere1BWrite_7cyc_4L_4XY : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 7; + let NumMicroOps = 8; +} + +def Ampere1BWrite_7cyc_4XY_4S_4Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 7; + let NumMicroOps = 12; +} + +def Ampere1BWrite_8cyc_1BS_1L : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitL]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1BWrite_8cyc_1BS_1XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1BWrite_8cyc_2L_3XY : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY]> { + let Latency = 8; + let NumMicroOps = 5; +} + +def Ampere1BWrite_8cyc_3L_3XY : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitL, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 8; + let NumMicroOps = 6; +} + +def Ampere1BWrite_8cyc_4L_4XY : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 8; + let NumMicroOps = 8; +} + +def Ampere1BWrite_8cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1BWrite_8cyc_4XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def 
Ampere1BWrite_9cyc_6XY_4S_4Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 9; + let NumMicroOps = 14; +} + +def Ampere1BWrite_9cyc_1A_1BS_1X : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitBS, Ampere1BUnitX]> { + let Latency = 9; + let NumMicroOps = 3; +} + +def Ampere1BWrite_9cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitBS, Ampere1BUnitXY]> { + let Latency = 9; + let NumMicroOps = 3; +} + +def Ampere1BWrite_9cyc_3L_3XY : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitL, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 9; + let NumMicroOps = 6; +} + +def Ampere1BWrite_9cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 9; + let NumMicroOps = 1; +} + +def Ampere1BWrite_9cyc_3XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 9; + let NumMicroOps = 3; +} + +def Ampere1BWrite_10cyc_4L_8XY : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 10; + let NumMicroOps = 12; +} + +def Ampere1BWrite_11cyc_1BS_2XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def Ampere1BWrite_11cyc_4L_8XY : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 11; + let NumMicroOps = 12; +} + +def Ampere1BWrite_12cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 12; + let NumMicroOps = 1; +} + +def Ampere1BWrite_13cyc_1BS_1X : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitX]> { + let Latency = 13; + let NumMicroOps = 2; +} + +def Ampere1BWrite_17cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 
17;
+  let NumMicroOps = 1;
+}
+
+// 64-bit integer divide (SDIVXr/UDIVXr): two multi-cycle integer uops plus
+// one X-unit uop.
+// NOTE(review): latency was 13 here, contradicting both the "19cyc" name and
+// the WriteID64 latency of 19 declared in the coarse mapping below; 13 looks
+// like a copy-paste from Ampere1BWrite_13cyc_1BS_1X. Corrected to 19.
+def Ampere1BWrite_19cyc_2BS_1X : SchedWriteRes<[Ampere1BUnitBS,
+                                                Ampere1BUnitBS,
+                                                Ampere1BUnitX]> {
+  let Latency = 19;
+  let NumMicroOps = 3;
+}
+
+def Ampere1BWrite_19cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
+  let Latency = 19;
+  let NumMicroOps = 1;
+}
+
+def Ampere1BWrite_21cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
+  let Latency = 21;
+  let NumMicroOps = 1;
+}
+
+def Ampere1BWrite_33cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
+  let Latency = 33;
+  let NumMicroOps = 1;
+}
+
+def Ampere1BWrite_39cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
+  let Latency = 39;
+  let NumMicroOps = 1;
+}
+
+def Ampere1BWrite_63cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
+  let Latency = 63;
+  let NumMicroOps = 1;
+}
+
+// For basic arithmetic, we have more flexibility for short shifts (LSL shift <= 4),
+// which are a single uop, and for extended registers, which have full flexibility
+// across Unit A or B for both uops.
+// NOTE(review): the SchedVar entries below are missing their
+// <SchedPredicate, [SchedWrite list]> template arguments (likely lost in a
+// copy/extraction step) — as written they will not parse; restore the
+// predicate/write pairs from the upstream AArch64SchedAmpere1B.td.
+def Ampere1BWrite_Arith : SchedWriteVariant<[
+      SchedVar,
+      SchedVar,
+      SchedVar]>;
+
+def Ampere1BWrite_ArithFlagsetting : SchedWriteVariant<[
+      SchedVar,
+      SchedVar,
+      SchedVar]>;
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latencies for Ampere-1B.
+// This provides a coarse model, which is then specialised below.
+ +def : WriteRes; // MOVN, MOVZ +def : WriteRes; // ALU +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; +} // ALU of Shifted-Reg +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; +} // ALU of Extended-Reg +def : WriteRes; // EXTR shifts a reg pair +def : WriteRes; // Shift/Scale +def : WriteRes { + let Latency = 13; +} // 32-bit Divide +def : WriteRes { + let Latency = 19; +} // 64-bit Divide +def : WriteRes { + let Latency = 3; +} // 32-bit Multiply +def : WriteRes { + let Latency = 3; +} // 64-bit Multiply +def : WriteRes; +def : WriteRes; +def : WriteRes { + let Latency = 3; +} // Load from base addr plus immediate offset +def : WriteRes { + let Latency = 1; +} // Store to base addr plus immediate offset +def : WriteRes { + let Latency = 1; + let NumMicroOps = 1; +} // Store a register pair. +def : WriteRes; +def : WriteRes { + let Latency = 3; + let NumMicroOps = 1; +} // Load from a register index (maybe scaled). +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} // Store to a register index (maybe scaled). +def : WriteRes { + let Latency = 2; +} // General floating-point ops. +def : WriteRes { + let Latency = 3; +} // Floating-point compare. +def : WriteRes { + let Latency = 3; +} // Float conversion. +def : WriteRes { +} // Float-int register copy. +def : WriteRes { + let Latency = 2; +} // Float-int register copy. +def : WriteRes { + let Latency = 4; +} // Floating-point multiply. +def : WriteRes { + let Latency = 19; +} // Floating-point division. +def : WriteRes { + let Latency = 3; +} // 64bit Vector D ops. +def : WriteRes { + let Latency = 3; +} // 128bit Vector Q ops. +def : WriteRes { + let Latency = 4; +} // Vector loads. +def : WriteRes { + let Latency = 2; +} // Vector stores. 
+ +def : WriteRes { let Unsupported = 1; } + +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } + +def : WriteRes { + let Latency = 3; +} // The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP + +// Forwarding logic. +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +//===----------------------------------------------------------------------===// +// Specialising the scheduling model further for Ampere-1B. + +def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs COPY)>; + +// Branch instructions +def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs Bcc, BL, RET)>; +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>; +def : InstRW<[Ampere1BWrite_1cyc_2A], (instrs BLR)>; + +// Common Short Sequence Compression (CSSC) +def : InstRW<[Ampere1BWrite_1cyc_1AB], (instregex "^ABS[WX]")>; +def : InstRW<[Ampere1BWrite_3cyc_1BS], (instregex "^CNT[WX]")>; +def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "^CTZ[WX]")>; +def : InstRW<[Ampere1BWrite_1cyc_1AB_1A], (instregex "^[SU](MAX|MIN)[WX]")>; + +// Cryptography instructions +// -- AES encryption/decryption +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^AES[DE]")>; +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^AESI?MC")>; +// -- Polynomial multiplication +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^PMUL", "^PMULL")>; +// -- SHA-256 hash +def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA256(H|H2)")>; +// -- SHA-256 schedule update +def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA256SU[01]")>; +// -- SHA-3 instructions +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>; +// -- SHA-512 hash +def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA512(H|H2)")>; +// -- SHA-512 schedule update +def : 
InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA512SU[01]")>; +// -- SHA1 choose/majority/parity +def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA1[CMP]")>; +// -- SHA1 hash/schedule update +def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA1SU[01]")>; +def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA1H")>; +// -- SM3 hash +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^SM3PARTW[12]$", "^SM3SS1$", "^SM3TT[12][AB]$")>; +def : InstRW<[Ampere1BWrite_4cyc_1X], (instrs SM4E, SM4ENCKEY)>; + +// FP and vector load instructions +// -- Load 1-element structure to one/all lanes +// ---- all lanes +def : InstRW<[Ampere1BWrite_6cyc_1L_1XY], + (instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// ---- one lane +def : InstRW<[Ampere1BWrite_6cyc_1L_1XY], + (instregex "^LD1i(8|16|32|64)")>; +// -- Load 1-element structure to one/all lanes, 1D size +def : InstRW<[Ampere1BWrite_4cyc_1L], + (instregex "^LD1Rv1d")>; +// -- Load 1-element structures to 1 register +def : InstRW<[Ampere1BWrite_4cyc_1L], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 2 registers +def : InstRW<[Ampere1BWrite_4cyc_2L], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 3 registers +def : InstRW<[Ampere1BWrite_5cyc_3L], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 4 registers +def : InstRW<[Ampere1BWrite_5cyc_4L], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 2-element structure to all lanes of 2 registers, 1D size +def : InstRW<[Ampere1BWrite_4cyc_2L], + (instregex "^LD2Rv1d")>; +// -- Load 2-element structure to all lanes of 2 registers, other sizes +def : InstRW<[Ampere1BWrite_6cyc_2L_2XY], + (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 2-element structure to one lane of 2 registers +def : InstRW<[Ampere1BWrite_6cyc_2L_2XY], + (instregex "^LD2i(8|16|32|64)")>; +// -- Load 2-element structures to 2 registers, 16B/8H/4S/2D size +def 
: InstRW<[Ampere1BWrite_6cyc_2L_2XY], + (instregex "^LD2Twov(16b|8h|4s|2d)")>; +// -- Load 2-element structures to 2 registers, 8B/4H/2S size +def : InstRW<[Ampere1BWrite_8cyc_2L_3XY], + (instregex "^LD2Twov(8b|4h|2s)")>; +// -- Load 3-element structure to all lanes of 3 registers, 1D size +def : InstRW<[Ampere1BWrite_5cyc_3L], + (instregex "^LD3Rv1d")>; +// -- Load 3-element structure to all lanes of 3 registers, other sizes +def : InstRW<[Ampere1BWrite_7cyc_3L_3XY], + (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 3-element structure to one lane of 3 registers +def : InstRW<[Ampere1BWrite_7cyc_3L_3XY], + (instregex "^LD3i(8|16|32|64)")>; +// -- Load 3-element structures to 3 registers, 16B/8H/4S sizes +def : InstRW<[Ampere1BWrite_8cyc_3L_3XY], + (instregex "^LD3Threev(16b|8h|4s)")>; +// -- Load 3-element structures to 3 registers, 2D size +def : InstRW<[Ampere1BWrite_7cyc_3L_3XY], + (instregex "^LD3Threev2d")>; +// -- Load 3-element structures to 3 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1BWrite_9cyc_3L_3XY], + (instregex "^LD3Threev(8b|4h|2s)")>; +// -- Load 4-element structure to all lanes of 4 registers, 1D size +def : InstRW<[Ampere1BWrite_5cyc_4L], + (instregex "^LD4Rv1d")>; +// -- Load 4-element structure to all lanes of 4 registers, other sizes +def : InstRW<[Ampere1BWrite_7cyc_4L_4XY], + (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 4-element structure to one lane of 4 registers +def : InstRW<[Ampere1BWrite_7cyc_4L_4XY], + (instregex "^LD4i(8|16|32|64)")>; +// -- Load 4-element structures to 4 registers, 2D size +def : InstRW<[Ampere1BWrite_8cyc_4L_4XY], + (instregex "^LD4Fourv2d")>; +// -- Load 4-element structures to 4 registers, 2S size +def : InstRW<[Ampere1BWrite_11cyc_4L_8XY], + (instregex "^LD4Fourv2s")>; +// -- Load 4-element structures to 4 registers, other sizes +def : InstRW<[Ampere1BWrite_10cyc_4L_8XY], + (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>; +// -- Load pair, Q-form +def : InstRW<[Ampere1BWrite_4cyc_2L], 
(instregex "LDN?PQ")>; +// -- Load pair, S/D-form +def : InstRW<[Ampere1BWrite_5cyc_1L_1BS], (instregex "LDN?P(S|D)")>; +// -- Load register +def : InstRW<[Ampere1BWrite_4cyc_1L], (instregex "LDU?R[BHSDQ]i")>; +// -- Load register, sign-extended register +def : InstRW<[Ampere1BWrite_4cyc_1L], (instregex "LDR[BHSDQ]ro(W|X)")>; + +// FP and vector store instructions +// -- Store 1-element structure from one lane of 1 register +def : InstRW<[Ampere1BWrite_4cyc_1XY_1S_1Z], + (instregex "^ST1i(8|16|32|64)")>; +// -- Store 1-element structures from 1 register +def : InstRW<[Ampere1BWrite_2cyc_1S_1Z], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 2 registers +def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 3 registers +def : InstRW<[Ampere1BWrite_4cyc_3S_3Z], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 4 registers +def : InstRW<[Ampere1BWrite_5cyc_4S_4Z], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 2-element structure from one lane of 2 registers +def : InstRW<[Ampere1BWrite_5cyc_2XY_2S_2Z], + (instregex "^ST2i(8|16|32|64)")>; +// -- Store 2-element structures from 2 registers, 16B/8H/4S/2D sizes +def : InstRW<[Ampere1BWrite_5cyc_2XY_2S_2Z], + (instregex "^ST2Twov(16b|8h|4s|2d)")>; +// -- Store 2-element structures from 2 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1BWrite_6cyc_2XY_2S_2Z], + (instregex "^ST2Twov(8b|4h|2s)")>; +// -- Store 3-element structure from one lane of 3 registers +def : InstRW<[Ampere1BWrite_6cyc_3XY_3S_3Z], + (instregex "^ST3i(8|16|32|64)")>; +// -- Store 3-element structures from 3 registers +def : InstRW<[Ampere1BWrite_6cyc_3XY_3S_3Z], + (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 4-element structure from one lane of 4 registers +def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z], + (instregex "^ST4i(8|16|32|64)")>; +// -- 
Store 4-element structures from 4 registers, 16B/8H/4S sizes +def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z], + (instregex "^ST4Fourv(16b|8h|4s)")>; +// -- Store 4-element structures from 4 registers, 2D sizes +def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z], + (instregex "^ST4Fourv2d")>; +// -- Store 4-element structures from 4 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1BWrite_9cyc_6XY_4S_4Z], + (instregex "^ST4Fourv(8b|4h|2s)")>; +// -- Store pair, Q-form +def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], (instregex "^STN?PQ")>; +// -- Store pair, S/D-form +def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], (instregex "^STN?P[SD]")>; +// -- Store register +def : InstRW<[Ampere1BWrite_2cyc_1S_1Z], (instregex "^STU?R[BHSDQ](ui|i)")>; +// -- Store register, sign-extended register offset +def : InstRW<[Ampere1BWrite_2cyc_1S_1Z], (instregex "^STR[BHSDQ]ro[XW]")>; + +// FP data processing, bfloat16 format +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instrs BFCVT)>; +def : InstRW<[Ampere1BWrite_8cyc_2XY], (instrs BFCVTN, BFCVTN2)>; +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^BFDOTv", "^BF16DOT")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instrs BFMMLA)>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^BFMLAL")>; + +// FP data processing, scalar/vector, half precision +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(ABD|ABS)v.[fi]16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>; +def : InstRW<[Ampere1BWrite_3cyc_1X], + (instregex "^FCMPE?H")>; +def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1X], + (instregex "^FCCMPE?H")>; +def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1XY], + (instregex "^FCSELH")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if]16")>; +// Convert FP to integer, H-form +def : InstRW<[Ampere1BWrite_3cyc_1XY], 
(instregex "^[SUd]CVTFv.[fi]16")>; +// Convert to FP from GPR, H-form +def : InstRW<[Ampere1BWrite_8cyc_1BS_1XY], (instregex "^[SU]CVTF_ZPmZ_[DSH]toH$")>; +// Convert to FP from GPR, fixed-point, H-form +def : InstRW<[Ampere1BWrite_11cyc_1BS_2XY], (instregex "^[SU]CVTF[SU][WX]Hri$")>; +def : InstRW<[Ampere1BWrite_9cyc_1X], (instrs FDIVHrr)>; +def : InstRW<[Ampere1BWrite_17cyc_1X], (instregex "^FDIVv.[if]16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>; +def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>; +def : InstRW<[Ampere1BWrite_9cyc_3XY], (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FMULX?v.[if]16")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULX16)>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[H]rrr")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FML[AS]v.[if]16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRECPXv.[if]16")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^F(RECP|RSQRT)S16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if]16")>; +// FP square root, H-form +def : InstRW<[Ampere1BWrite_21cyc_1X], (instrs FSQRTHr)>; +// FP square root, vector-form, F16 +def : InstRW<[Ampere1BWrite_39cyc_1X], (instregex "^FSQRTv.f16")>; + +// FP data processing, scalar/vector, single/double precision +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1X], + (instregex "^FCMPE?(S|D)")>; +def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1X], + (instregex "^FCCMPE?(S|D)")>; +def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1XY], + 
(instregex "^FCSEL(S|D)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>; +// Convert FP to integer, S/D-form +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^[SUd]CVTFv.[fi](32|64)")>; +// Convert to FP from GPR, S/D-form +def : InstRW<[Ampere1BWrite_8cyc_1BS_1XY], (instregex "^[SU]CVTF_ZPmZ_[DSH]to[DS]$")>; +// Convert to FP from GPR, fixed-point, S/D-form +def : InstRW<[Ampere1BWrite_11cyc_1BS_2XY], (instregex "^[SU]CVTF[SU][WX][SD]ri$")>; +def : InstRW<[Ampere1BWrite_19cyc_1X], (instregex "^FDIVv.[if](64)", "FDIVD")>; +def : InstRW<[Ampere1BWrite_12cyc_1X], (instregex "^FDIVv.[if](32)", "FDIVS")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULX32, FMULX64)>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULSrr, FNMULSrr)>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULDrr, FNMULDrr)>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT(32|64)")>; +def : InstRW<[Ampere1BWrite_63cyc_1X], (instregex "^FSQRTv.f64", "^FSQRTDr")>; +def : InstRW<[Ampere1BWrite_33cyc_1X], (instregex "^FSQRTv.f32", "^FSQRTSr")>; + +// FP miscellaneous instructions +def : InstRW<[Ampere1BWrite_7cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[HSD]Hr")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex 
"^FCVT[HSD][SD]r")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVTLv")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT(N|XN)v")>; +def : InstRW<[Ampere1BWrite_7cyc_1X_1Z], (instrs FJCVTZS)>; +def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>; +def : InstRW<[Ampere1BWrite_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>; +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>; +def : InstRW<[Ampere1BWrite_5cyc_1X], (instregex "^FMOVXDHighr")>; +def : InstRW<[Ampere1BWrite_3cyc_1Z], (instregex "^FMOV[WX][HSD]r")>; + +// Integer arithmetic and logical instructions +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instregex "ADC(W|X)r", "SBC(W|X)r")>; +def : InstRW<[Ampere1BWrite_Arith], + (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[sx]")>; +def : InstRW<[Ampere1BWrite_1cyc_1AB], + (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[ri]")>; +def : InstRW<[Ampere1BWrite_ArithFlagsetting], + (instregex "(ADD|AND|BIC|SUB)S[WX]r[sx]")>; +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instregex "(ADD|AND|BIC|SUB)S[WX]r[ri]")>; +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instregex "(ADC|SBC)S[WX]r")>; +def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs RMIF)>; +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instregex "(CCMN|CCMP)(X|W)")>; +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>; +def : InstRW<[Ampere1BWrite_13cyc_1BS_1X], (instrs SDIVWr, UDIVWr)>; +def : InstRW<[Ampere1BWrite_19cyc_2BS_1X], (instrs SDIVXr, UDIVXr)>; +def : InstRW<[Ampere1BWrite_3cyc_1BS], + (instregex "(S|U)MULHr")>; +def : InstRW<[Ampere1BWrite_4cyc_1BS_1AB], + (instregex "(S|U)?M(ADD|SUB)L?r")>; + +// Integer load instructions +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "(LDNP|LDP|LDPSW)(X|W)")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDR(B|D|H|Q|S)ui")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDR(D|Q|W|X)l")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDTR(B|H|W|X)i")>; +def : 
InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDTRS(BW|BX|HW|HX|W)i")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDUR(BB|HH|X|W)i")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDURS(BW|BX|HW|HX|W)i")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>; +def : InstRW<[Ampere1BWrite_1cyc_1L], + (instrs PRFMl, PRFUMi, PRFUMi)>; +def : InstRW<[Ampere1BWrite_1cyc_1L], + (instrs PRFMroW, PRFMroX)>; + +// Integer miscellaneous instructions +def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs ADR, ADRP)>; +def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "EXTR(W|X)")>; +def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "(S|U)?BFM(W|X)")>; +def : InstRW<[Ampere1BWrite_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>; +def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "CLS(W|X)")>; +def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs SETF8, SETF16)>; +def : InstRW<[Ampere1BWrite_1cyc_1AB], + (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>; +def : InstRW<[Ampere1BWrite_1cyc_1B], + (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>; +def : InstRW<[Ampere1BWrite_1cyc_1B], + (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>; + +// Integer store instructions +def : InstRW<[Ampere1BWrite_1cyc_2S], (instregex "STNP(X|W)i")>; +def : InstRW<[Ampere1BWrite_1cyc_2S], (instrs STPXi)>; +def : InstRW<[Ampere1BWrite_2cyc_1B_1S], (instrs STPWi)>; +def : InstRW<[Ampere1BWrite_2cyc_1B_1S_1AB], (instregex "STP(W|X)(pre|post)")>; +def : InstRW<[Ampere1BWrite_1cyc_1S], (instrs STTRBi, STTRHi, STTRWi, STTRXi)>; +def : InstRW<[Ampere1BWrite_1cyc_1S], (instregex "STUR(BB|HH|X|W)i", + "STR(X|W)ui", + "STUR(BB|HH|X|W)i")>; +def : InstRW<[Ampere1BWrite_1cyc_2S], (instrs STRWroX, STRXroX)>; +def : InstRW<[Ampere1BWrite_1cyc_2S], (instrs STRWroW, STRXroW)>; + +// Memory tagging + +// Insert Random Tags +def : InstRW<[Ampere1BWrite_1cyc_1BS_1B], (instrs IRG, IRGstack)>; +// Load allocation tag +def : InstRW<[Ampere1BWrite_4cyc_1L_1B], (instrs LDG, LDGM)>; +// Store 
allocation tags +def : InstRW<[Ampere1BWrite_1cyc_1S], + (instrs STGi, STGM, STGPreIndex, STGPostIndex)>; +// Store allocation tags and pair of registers +def : InstRW<[Ampere1BWrite_1cyc_2S], + (instrs STGPi, STGPpre, STGPpost)>; +// Store allocation tags and zero data +def : InstRW<[Ampere1BWrite_1cyc_1S], + (instrs STZGi, STZGM, STZGPreIndex, STZGPostIndex)>; +// Store two tags +def : InstRW<[Ampere1BWrite_1cyc_2S], + (instrs ST2Gi, ST2GPreIndex, ST2GPostIndex)>; +// Store two tags and zero data +def : InstRW<[Ampere1BWrite_1cyc_2S], + (instrs STZ2Gi, STZ2GPreIndex, STZ2GPostIndex)>; +// Subtract Pointer +def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs SUBP)>; +// Subtract Pointer, flagset +def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs SUBPS)>; +// Insert Tag Mask +def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs GMI)>; +// Arithmetic, immediate to logical address tag +def : InstRW<[Ampere1BWrite_1cyc_1B], (instrs ADDG, SUBG)>; + +// Pointer authentication +def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^AUT")>; +def : InstRW<[Ampere1BWrite_6cyc_1BS_1A], + (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>; +def : InstRW<[Ampere1BWrite_6cyc_1BS_2A], + (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>; +def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^PAC")>; +def : InstRW<[Ampere1BWrite_8cyc_1BS_1L], (instregex "^LDRA(A|B)")>; +def : InstRW<[Ampere1BWrite_1cyc_1B], (instrs XPACD, XPACI)>; + +// Vector integer instructions +// -- absolute difference +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^SABAv", "^SABALv", "^SABDv", "^SABDLv", + "^UABAv", "^UABALv", "^UABDv", "^UABDLv")>; +// -- arithmetic +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^ABSv", "^(ADD|SUB)v", "^SADDLv", "^SADDW", "SHADD", + "SHSUB", "^SRHADD", "^URHADD", "SSUBL", "SSUBW", + "^UADDLv", "^UADDW", "UHADD", "UHSUB", "USUBL", "USUBW")>; +// -- arithmetic, horizontal, 16B +def : InstRW<[Ampere1BWrite_8cyc_4XY], + (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>; +def : 
InstRW<[Ampere1BWrite_8cyc_4XY], + (instregex "^[SU](MIN|MAX)Vv16i8v")>; +// -- arithmetic, horizontal, 4H/4S +def : InstRW<[Ampere1BWrite_4cyc_2XY], + (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v")>; +def : InstRW<[Ampere1BWrite_4cyc_2XY], + (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>; +// -- arithmetic, horizontal, 8B/8H +def : InstRW<[Ampere1BWrite_6cyc_3XY], + (instregex "^[SU]?ADDL?V(v8i16|v4i32)v")>; +def : InstRW<[Ampere1BWrite_6cyc_3XY], + (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>; +// -- arithmetic, narrowing +def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "(ADD|SUB)HNv.*")>; +def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "(RADD|RSUB)HNv.*")>; +// -- arithmetic, pairwise +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^ADDPv", "^SADALP", "^UADALP", "^SADDLPv", "^UADDLPv")>; +// -- arithmetic, saturating +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^SQADD", "^SQSUB", "^SUQADD", "^UQADD", "^UQSUB", "^USQADD")>; +// -- bit count +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^(CLS|CLZ|CNT)v")>; +// -- compare +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^CMEQv", "^CMGEv", "^CMGTv", "^CMLEv", "^CMLTv", + "^CMHIv", "^CMHSv")>; +// -- compare non-zero +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^CMTSTv")>; +// -- dot product +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^(S|SU|U|US)DOTv")>; +// -- fp reciprocal estimate +def : InstRW<[Ampere1BWrite_6cyc_1X], (instregex "^FRECPEv", "^FRSQRTEv")>; +// -- integer reciprocal estimate +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^URECPEv", "^URSQRTEv")>; +// -- logical +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>; +// -- logical, narrowing +def : InstRW<[Ampere1BWrite_6cyc_2XY], + (instregex "RSHRNv", + "SHRNv", "SQSHRNv", "SQSHRUNv", + "UQXTNv")>; +// -- matrix multiply +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instrs SMMLA, UMMLA, USMMLA)>; +// -- max/min +def : 
InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^SMAXv", "^SMINv", "^UMAXv", "^UMINv")>; +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>; +// -- move immediate +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^MOVIv", "^MVNIv")>; +// -- multiply +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "MULv", "SMULLv", "UMULLv", "SQDMUL(H|L)v", "SQRDMULHv")>; +// -- multiply accumulate +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "MLAv", "MLSv", "(S|U|SQD)(MLAL|MLSL)v", "SQRDML(A|S)Hv")>; +// -- negation, saturating +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^SQABS", "^SQNEG")>; +// -- reverse bits/bytes +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^RBITv", "^REV16v", "^REV32v", "^REV64v")>; +// -- shift +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; +// -- shift and accumulate +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>; +// -- shift, saturating +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^SQRSHLv", "^SQRSHRNv", "^SQRSHRUNv", "^SQSHL", "^SQSHLU", + "^SQXTNv", "^SQXTUNv", "^UQSHRNv", "UQRSHRNv", "^UQRSHL", + "^UQSHL")>; + +// Vector miscellaneous instructions +// -- duplicate element +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^DUPv.+lane")>; +// -- duplicate from GPR +def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^DUPv.+gpr")>; +// -- extract narrow +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^XTNv")>; +// -- insert/extract element +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>; +// -- move FP immediate +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^FMOVv")>; +// -- move element to GPR +def : InstRW<[Ampere1BWrite_5cyc_1X], (instregex "(S|U)MOVv")>; +// -- move from GPR to any element +def : InstRW<[Ampere1BWrite_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>; +// -- table lookup +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instrs TBLv8i8One, 
TBLv16i8One, TBXv8i8One, TBXv16i8One)>; +def : InstRW<[Ampere1BWrite_4cyc_2XY], + (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>; +def : InstRW<[Ampere1BWrite_6cyc_3XY], + (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>; +def : InstRW<[Ampere1BWrite_8cyc_4XY], + (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>; +// -- transpose +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>; +// -- zip/unzip +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>; + +} // SchedModel = Ampere1BModel diff --git a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td index 1ef3a2a063382d..48324654949c06 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td +++ b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td @@ -21,7 +21,7 @@ def CycloneModel : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td index 2127a34a58d513..6fc4ec3ae41b77 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td @@ -27,7 +27,7 @@ def ExynosM3Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td index 83cf56088d4ced..5163de280f2e4f 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td @@ -27,7 +27,7 @@ def ExynosM4Model : SchedMachineModel { list UnsupportedFeatures = 
!listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td index 85058af86decb5..2ccbe1614dcd79 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td @@ -27,7 +27,7 @@ def ExynosM5Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td index a765cd1cdfe347..e9172e82b099d1 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td +++ b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td @@ -26,7 +26,7 @@ def FalkorModel : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedKryo.td b/llvm/lib/Target/AArch64/AArch64SchedKryo.td index 3551066ee7c35d..258b34c38898cd 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedKryo.td +++ b/llvm/lib/Target/AArch64/AArch64SchedKryo.td @@ -30,7 +30,7 @@ def KryoModel : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); // FIXME: Remove when all errors have been fixed. 
let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td index 2ec9600f84f7e5..524fa33f498bb0 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td @@ -25,7 +25,7 @@ def NeoverseN1Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(PAUnsupported.F, SMEUnsupported.F, SVEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td index a6fab5e6245f80..8ec124954362f8 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td @@ -19,7 +19,7 @@ def NeoverseN2Model : SchedMachineModel { let CompleteModel = 1; list UnsupportedFeatures = !listconcat(SMEUnsupported.F, - [HasSVE2p1, HasPAuthLR, HasCPA]); + [HasSVE2p1, HasPAuthLR, HasCPA, HasCSSC]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td index 75fbb85dce9d14..613db353cb0aaa 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td @@ -28,7 +28,8 @@ def NeoverseV1Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVE2Unsupported.F, SMEUnsupported.F, - [HasMTE, HasCPA]); + [HasMTE, HasCPA, + HasCSSC]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td index 658d7cdd23a63b..e7de40fdf1deb0 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td @@ -22,7 +22,8 @@ def 
NeoverseV2Model : SchedMachineModel { let CompleteModel = 1; list UnsupportedFeatures = !listconcat(SMEUnsupported.F, - [HasSVE2p1, HasCPA]); + [HasSVE2p1, HasCPA, + HasCSSC]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td index 9e5060f1f36496..0ae9a69fd48265 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td +++ b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td @@ -27,7 +27,7 @@ def TSV110Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); } // Define each kind of processor resource and number available on the TSV110, diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td index e1536f208e448a..8df3f56e45738c 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td @@ -28,7 +28,7 @@ def ThunderXT8XModel : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td index 89faa92155e00d..ef4baa3dedff93 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td @@ -28,7 +28,7 @@ def ThunderX2T99Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); // FIXME: Remove when all errors have been fixed. 
let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td index 8685554b00d76d..796bd4b8b5c9ae 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td @@ -27,7 +27,7 @@ def ThunderX3T110Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index e3a0606331db1c..dd4c0e2eb64249 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -296,6 +296,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { break; case Ampere1: case Ampere1A: + case Ampere1B: CacheLineSize = 64; PrefFunctionAlignment = Align(64); PrefLoopAlignment = Align(64); diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 16864102df59b0..f8dcbe97b6321a 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -42,6 +42,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { A64FX, Ampere1, Ampere1A, + Ampere1B, AppleA7, AppleA10, AppleA11, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index d611338fc268f9..992b11da7eeee5 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -233,15 +233,20 @@ static bool hasPossibleIncompatibleOps(const Function *F) { bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { - SMEAttrs CallerAttrs(*Caller); - SMEAttrs CalleeAttrs(*Callee); + 
SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee); + + // When inlining, we should consider the body of the function, not the + // interface. + if (CalleeAttrs.hasStreamingBody()) { + CalleeAttrs.set(SMEAttrs::SM_Compatible, false); + CalleeAttrs.set(SMEAttrs::SM_Enabled, true); + } + if (CalleeAttrs.hasNewZABody()) return false; if (CallerAttrs.requiresLazySave(CalleeAttrs) || - (CallerAttrs.requiresSMChange(CalleeAttrs) && - (!CallerAttrs.hasStreamingInterfaceOrBody() || - !CalleeAttrs.hasStreamingBody()))) { + CallerAttrs.requiresSMChange(CalleeAttrs)) { if (hasPossibleIncompatibleOps(Callee)) return false; } @@ -4062,4 +4067,4 @@ bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) { cast(I->getNextNode())->isUnconditional()) return true; return BaseT::shouldTreatInstructionLikeSelect(I); -} \ No newline at end of file +} diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index fd69a7d6c33d03..4b9d549e791142 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -622,9 +622,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .lowerIf([=](const LegalityQuery &Query) { LLT DstTy = Query.Types[0]; LLT SrcTy = Query.Types[1]; - return DstTy.isVector() && (SrcTy.getSizeInBits() > 128 || - (DstTy.getScalarSizeInBits() * 2 < - SrcTy.getScalarSizeInBits())); + return DstTy.isVector() && SrcTy.getSizeInBits() > 128 && + DstTy.getScalarSizeInBits() * 2 <= SrcTy.getScalarSizeInBits(); }) .alwaysLegal(); diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 10e69655f77e10..8b32d593d2a812 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -248,34 +248,6 @@ static inline bool atomicBarrierDroppedOnZero(unsigned Opcode) { return false; } -static inline 
std::optional -getArm64ECMangledFunctionName(std::string Name) { - bool IsCppFn = Name[0] == '?'; - if (IsCppFn && Name.find("$$h") != std::string::npos) - return std::nullopt; - if (!IsCppFn && Name[0] == '#') - return std::nullopt; - - StringRef Prefix = "$$h"; - size_t InsertIdx = 0; - if (IsCppFn) { - InsertIdx = Name.find("@@"); - size_t ThreeAtSignsIdx = Name.find("@@@"); - if (InsertIdx != std::string::npos && InsertIdx != ThreeAtSignsIdx) { - InsertIdx += 2; - } else { - InsertIdx = Name.find("@"); - if (InsertIdx != std::string::npos) - InsertIdx++; - } - } else { - Prefix = "#"; - } - - Name.insert(Name.begin() + InsertIdx, Prefix.begin(), Prefix.end()); - return std::optional(Name); -} - namespace AArch64CC { // The CondCodes constants map directly to the 4-bit encoding of the condition diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index cb29d5d9475981..250e3e350c02e9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1506,6 +1506,7 @@ def FeatureISAVersion12 : FeatureSet< FeatureFlatAtomicFaddF32Inst, FeatureImageInsts, FeatureExtendedImageInsts, + FeatureFP8ConversionInsts, FeaturePackedTID, FeatureVcmpxPermlaneHazard, FeatureSALUFloatInsts, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index a19b03b9292337..152f495a452ba2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -59,6 +59,30 @@ def gi_wmmaopselvop3pmods : GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_wmmavisrc : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_wmmamods : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_wmmamodsf16Neg : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_wmmamodsf16NegAbs : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_swmmacindex8 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_swmmacindex16 : + GIComplexOperandMatcher, 
+ GIComplexPatternEquiv; + def gi_vop3opselmods : GIComplexOperandMatcher, GIComplexPatternEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 4c35649cec6c8a..4f7bf3f7d35e71 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -3048,6 +3048,336 @@ bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In, return true; } +static MachineSDNode *buildRegSequence32(SmallVectorImpl &Elts, + llvm::SelectionDAG *CurDAG, + const SDLoc &DL) { + unsigned DstRegClass; + EVT DstTy; + switch (Elts.size()) { + case 8: + DstRegClass = AMDGPU::VReg_256RegClassID; + DstTy = MVT::v8i32; + break; + case 4: + DstRegClass = AMDGPU::VReg_128RegClassID; + DstTy = MVT::v4i32; + break; + case 2: + DstRegClass = AMDGPU::VReg_64RegClassID; + DstTy = MVT::v2i32; + break; + default: + llvm_unreachable("unhandled Reg sequence size"); + } + + SmallVector Ops; + Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32)); + for (unsigned i = 0; i < Elts.size(); ++i) { + Ops.push_back(Elts[i]); + Ops.push_back(CurDAG->getTargetConstant( + SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32)); + } + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops); +} + +static MachineSDNode *buildRegSequence16(SmallVectorImpl &Elts, + llvm::SelectionDAG *CurDAG, + const SDLoc &DL) { + SmallVector PackedElts; + assert("unhandled Reg sequence size" && + (Elts.size() == 8 || Elts.size() == 16)); + + // Pack 16-bit elements in pairs into 32-bit register. If both elements are + // unpacked from 32-bit source use it, otherwise pack them using v_perm. 
+ for (unsigned i = 0; i < Elts.size(); i += 2) { + SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i])); + SDValue HiSrc; + if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) { + PackedElts.push_back(HiSrc); + } else { + SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32); + MachineSDNode *Packed = + CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32, + {Elts[i + 1], Elts[i], PackLoLo}); + PackedElts.push_back(SDValue(Packed, 0)); + } + } + + return buildRegSequence32(PackedElts, CurDAG, DL); +} + +static MachineSDNode *buildRegSequence(SmallVectorImpl &Elts, + llvm::SelectionDAG *CurDAG, + const SDLoc &DL, unsigned ElementSize) { + if (ElementSize == 16) + return buildRegSequence16(Elts, CurDAG, DL); + if (ElementSize == 32) + return buildRegSequence32(Elts, CurDAG, DL); + llvm_unreachable("Unhandled element size"); +} + +static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, + SmallVectorImpl &Elts, SDValue &Src, + llvm::SelectionDAG *CurDAG, const SDLoc &DL, + unsigned ElementSize) { + if (ModOpcode == ISD::FNEG) { + Mods |= SISrcMods::NEG; + // Check if all elements also have abs modifier + SmallVector NegAbsElts; + for (auto El : Elts) { + if (El.getOpcode() != ISD::FABS) + break; + NegAbsElts.push_back(El->getOperand(0)); + } + if (Elts.size() != NegAbsElts.size()) { + // Neg + Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0); + } else { + // Neg and Abs + Mods |= SISrcMods::NEG_HI; + Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0); + } + } else { + assert(ModOpcode == ISD::FABS); + // Abs + Mods |= SISrcMods::NEG_HI; + Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0); + } +} + +// Check all f16 elements for modifiers while looking through b32 and v2b16 +// build vector, stop if element does not satisfy ModifierCheck. 
+static void +checkWMMAElementsModifiersF16(BuildVectorSDNode *BV, + std::function ModifierCheck) { + for (unsigned i = 0; i < BV->getNumOperands(); ++i) { + if (auto *F16Pair = + dyn_cast(stripBitcast(BV->getOperand(i)))) { + for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) { + SDValue ElF16 = stripBitcast(F16Pair->getOperand(i)); + if (!ModifierCheck(ElF16)) + break; + } + } + } +} + +bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + Src = In; + unsigned Mods = SISrcMods::OP_SEL_1; + + // mods are on f16 elements + if (auto *BV = dyn_cast(stripBitcast(In))) { + SmallVector EltsF16; + + checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool { + if (Element.getOpcode() != ISD::FNEG) + return false; + EltsF16.push_back(Element.getOperand(0)); + return true; + }); + + // All elements have neg modifier + if (BV->getNumOperands() * 2 == EltsF16.size()) { + Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0); + Mods |= SISrcMods::NEG; + Mods |= SISrcMods::NEG_HI; + } + } + + // mods are on v2f16 elements + if (auto *BV = dyn_cast(stripBitcast(In))) { + SmallVector EltsV2F16; + for (unsigned i = 0; i < BV->getNumOperands(); ++i) { + SDValue ElV2f16 = stripBitcast(BV->getOperand(i)); + // Based on first element decide which mod we match, neg or abs + if (ElV2f16.getOpcode() != ISD::FNEG) + break; + EltsV2F16.push_back(ElV2f16.getOperand(0)); + } + + // All pairs of elements have neg modifier + if (BV->getNumOperands() == EltsV2F16.size()) { + Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0); + Mods |= SISrcMods::NEG; + Mods |= SISrcMods::NEG_HI; + } + } + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + Src = In; + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned ModOpcode; + + // mods are on f16 elements + if (auto *BV = 
dyn_cast(stripBitcast(In))) { + SmallVector EltsF16; + checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool { + // Based on first element decide which mod we match, neg or abs + if (EltsF16.empty()) + ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS; + if (ElF16.getOpcode() != ModOpcode) + return false; + EltsF16.push_back(ElF16.getOperand(0)); + return true; + }); + + // All elements have ModOpcode modifier + if (BV->getNumOperands() * 2 == EltsF16.size()) + selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In), + 16); + } + + // mods are on v2f16 elements + if (auto *BV = dyn_cast(stripBitcast(In))) { + SmallVector EltsV2F16; + + for (unsigned i = 0; i < BV->getNumOperands(); ++i) { + SDValue ElV2f16 = stripBitcast(BV->getOperand(i)); + // Based on first element decide which mod we match, neg or abs + if (EltsV2F16.empty()) + ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS; + if (ElV2f16->getOpcode() != ModOpcode) + break; + EltsV2F16.push_back(ElV2f16->getOperand(0)); + } + + // All elements have ModOpcode modifier + if (BV->getNumOperands() == EltsV2F16.size()) + selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In), + 32); + } + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + Src = In; + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned ModOpcode; + SmallVector EltsF32; + + if (auto *BV = dyn_cast(stripBitcast(In))) { + for (unsigned i = 0; i < BV->getNumOperands(); ++i) { + SDValue ElF32 = stripBitcast(BV->getOperand(i)); + // Based on first element decide which mod we match, neg or abs + if (EltsF32.empty()) + ModOpcode = (ElF32.getOpcode() == ISD::FNEG) ? 
ISD::FNEG : ISD::FABS; + if (ElF32.getOpcode() != ModOpcode) + break; + EltsF32.push_back(ElF32.getOperand(0)); + } + + // All elements had ModOpcode modifier + if (BV->getNumOperands() == EltsF32.size()) + selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In), + 32); + } + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const { + if (auto *BV = dyn_cast(In)) { + BitVector UndefElements; + if (SDValue Splat = BV->getSplatValue(&UndefElements)) + if (isInlineImmediate(Splat.getNode())) { + if (const ConstantSDNode *C = dyn_cast(Splat)) { + unsigned Imm = C->getAPIntValue().getSExtValue(); + Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32); + return true; + } + if (const ConstantFPSDNode *C = dyn_cast(Splat)) { + unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue(); + Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32); + return true; + } + llvm_unreachable("unhandled Constant node"); + } + } + + // 16 bit splat + SDValue SplatSrc32 = stripBitcast(In); + if (auto *SplatSrc32BV = dyn_cast(SplatSrc32)) { + if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) { + SDValue SplatSrc16 = stripBitcast(Splat32); + if (auto *SplatSrc16BV = dyn_cast(SplatSrc16)) { + if (SDValue Splat = SplatSrc16BV->getSplatValue()) { + + // f16 + if (isInlineImmediate(Splat.getNode())) { + const ConstantFPSDNode *C = dyn_cast(Splat); + int64_t Imm = C->getValueAPF().bitcastToAPInt().getSExtValue(); + Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i16); + return true; + } + + // bf16 + if (const ConstantSDNode *C = dyn_cast(Splat)) { + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + APInt BF16Value = C->getAPIntValue(); + APInt F32Value = BF16Value.zext(32).shl(16); + if (TII->isInlineConstant(F32Value)) { + int64_t Imm = F32Value.getSExtValue(); + Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32); + return true; + } + } + } + } 
+ } + } + + return false; +} + +bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src, + SDValue &IndexKey) const { + unsigned Key = 0; + Src = In; + + if (In.getOpcode() == ISD::SRL) { + const llvm::SDValue &ShiftSrc = In.getOperand(0); + ConstantSDNode *ShiftAmt = dyn_cast(In.getOperand(1)); + if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt && + ShiftAmt->getZExtValue() % 8 == 0) { + Key = ShiftAmt->getZExtValue() / 8; + Src = ShiftSrc; + } + } + + IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src, + SDValue &IndexKey) const { + unsigned Key = 0; + Src = In; + + if (In.getOpcode() == ISD::SRL) { + const llvm::SDValue &ShiftSrc = In.getOperand(0); + ConstantSDNode *ShiftAmt = dyn_cast(In.getOperand(1)); + if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt && + ShiftAmt->getZExtValue() == 16) { + Key = 1; + Src = ShiftSrc; + } + } + + IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const { Src = In; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 8645490f0b16f1..3b42d88df0c246 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -240,6 +240,16 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectVOP3PModsNeg(SDValue In, SDValue &Src) const; bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const; + bool SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src, + SDValue &SrcMods) const; + bool SelectWMMAModsF16Neg(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src, + SDValue &SrcMods) const; + bool SelectWMMAVISrc(SDValue In, SDValue &Src) const; + + bool SelectSWMMACIndex8(SDValue In, SDValue &Src, SDValue &IndexKey) 
const; + bool SelectSWMMACIndex16(SDValue In, SDValue &Src, SDValue &IndexKey) const; + bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 55d95154c75878..2af53a664ff173 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -577,6 +577,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN}); setMaxAtomicSizeInBitsSupported(64); + setMaxDivRemBitWidthSupported(64); } bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index fdee74d58d2691..f255d098b631c7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3956,6 +3956,219 @@ AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods( }}; } +static Register buildRegSequence(SmallVectorImpl &Elts, + MachineInstr *InsertPt, + MachineRegisterInfo &MRI) { + const TargetRegisterClass *DstRegClass; + switch (Elts.size()) { + case 8: + DstRegClass = &AMDGPU::VReg_256RegClass; + break; + case 4: + DstRegClass = &AMDGPU::VReg_128RegClass; + break; + case 2: + DstRegClass = &AMDGPU::VReg_64RegClass; + break; + default: + llvm_unreachable("unhandled Reg sequence size"); + } + + MachineIRBuilder B(*InsertPt); + auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE) + .addDef(MRI.createVirtualRegister(DstRegClass)); + for (unsigned i = 0; i < Elts.size(); ++i) { + MIB.addReg(Elts[i]); + MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i)); + } + return MIB->getOperand(0).getReg(); +} + +static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, + SmallVectorImpl &Elts, Register &Src, + MachineInstr 
*InsertPt, + MachineRegisterInfo &MRI) { + if (ModOpcode == TargetOpcode::G_FNEG) { + Mods |= SISrcMods::NEG; + // Check if all elements also have abs modifier + SmallVector NegAbsElts; + for (auto El : Elts) { + Register FabsSrc; + if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc)))) + break; + NegAbsElts.push_back(FabsSrc); + } + if (Elts.size() != NegAbsElts.size()) { + // Neg + Src = buildRegSequence(Elts, InsertPt, MRI); + } else { + // Neg and Abs + Mods |= SISrcMods::NEG_HI; + Src = buildRegSequence(NegAbsElts, InsertPt, MRI); + } + } else { + assert(ModOpcode == TargetOpcode::G_FABS); + // Abs + Mods |= SISrcMods::NEG_HI; + Src = buildRegSequence(Elts, InsertPt, MRI); + } +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const { + Register Src = Root.getReg(); + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned ModOpcode; + SmallVector EltsF32; + + if (GBuildVector *BV = dyn_cast(MRI->getVRegDef(Src))) { + for (unsigned i = 0; i < BV->getNumSources(); ++i) { + MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(i)); + // Based on first element decide which mod we match, neg or abs + if (EltsF32.empty()) + ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG) ? 
AMDGPU::G_FNEG + : AMDGPU::G_FABS; + if (ElF32->getOpcode() != ModOpcode) + break; + EltsF32.push_back(ElF32->getOperand(1).getReg()); + } + + // All elements had ModOpcode modifier + if (BV->getNumSources() == EltsF32.size()) { + selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(), + *MRI); + } + } + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const { + Register Src = Root.getReg(); + unsigned Mods = SISrcMods::OP_SEL_1; + SmallVector EltsV2F16; + + if (GConcatVectors *CV = dyn_cast(MRI->getVRegDef(Src))) { + for (unsigned i = 0; i < CV->getNumSources(); ++i) { + Register FNegSrc; + if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc)))) + break; + EltsV2F16.push_back(FNegSrc); + } + + // All elements had ModOpcode modifier + if (CV->getNumSources() == EltsV2F16.size()) { + Mods |= SISrcMods::NEG; + Mods |= SISrcMods::NEG_HI; + Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI); + } + } + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const { + Register Src = Root.getReg(); + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned ModOpcode; + SmallVector EltsV2F16; + + if (GConcatVectors *CV = dyn_cast(MRI->getVRegDef(Src))) { + for (unsigned i = 0; i < CV->getNumSources(); ++i) { + MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i)); + // Based on first element decide which mod we match, neg or abs + if (EltsV2F16.empty()) + ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG) ? 
AMDGPU::G_FNEG + : AMDGPU::G_FABS; + if (ElV2F16->getOpcode() != ModOpcode) + break; + EltsV2F16.push_back(ElV2F16->getOperand(1).getReg()); + } + + // All elements had ModOpcode modifier + if (CV->getNumSources() == EltsV2F16.size()) { + MachineIRBuilder B(*Root.getParent()); + selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(), + *MRI); + } + } + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const { + std::optional FPValReg; + if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) { + if (TII.isInlineConstant(FPValReg->Value.bitcastToAPInt())) { + return {{[=](MachineInstrBuilder &MIB) { + MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue()); + }}}; + } + // Non-inlineable splat floats should not fall-through for integer immediate + // checks. + return {}; + } + + APInt ICst; + if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) { + if (TII.isInlineConstant(ICst)) { + return { + {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}}; + } + } + + return {}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const { + Register Src = + getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); + unsigned Key = 0; + + Register ShiftSrc; + std::optional ShiftAmt; + if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) && + MRI->getType(ShiftSrc).getSizeInBits() == 32 && + ShiftAmt->Value.getZExtValue() % 8 == 0) { + Key = ShiftAmt->Value.getZExtValue() / 8; + Src = ShiftSrc; + } + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const { + + Register 
Src = + getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); + unsigned Key = 0; + + Register ShiftSrc; + std::optional ShiftAmt; + if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) && + MRI->getType(ShiftSrc).getSizeInBits() == 32 && + ShiftAmt->Value.getZExtValue() == 16) { + Src = ShiftSrc; + Key = 1; + } + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { Register Src; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 12ea46c2895b04..ef7630f137aca6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -199,6 +199,19 @@ class AMDGPUInstructionSelector final : public InstructionSelector { InstructionSelector::ComplexRendererFns selectWMMAOpSelVOP3PMods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectWMMAModsF32NegAbs(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectWMMAModsF16Neg(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectWMMAModsF16NegAbs(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectWMMAVISrc(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSWMMACIndex8(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSWMMACIndex16(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectVOP3OpSelMods(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 8e74d4c0e94592..17ffb7ec988f0a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ 
-4178,10 +4178,45 @@ bool AMDGPULegalizerInfo::loadInputValue( Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { const SIMachineFunctionInfo *MFI = B.getMF().getInfo(); - const ArgDescriptor *Arg; + const ArgDescriptor *Arg = nullptr; const TargetRegisterClass *ArgRC; LLT ArgTy; - std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); + + CallingConv::ID CC = B.getMF().getFunction().getCallingConv(); + const ArgDescriptor WorkGroupIDX = + ArgDescriptor::createRegister(AMDGPU::TTMP9); + // If GridZ is not programmed in an entry function then the hardware will set + // it to all zeros, so there is no need to mask the GridY value in the low + // order bits. + const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister( + AMDGPU::TTMP7, + AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu); + const ArgDescriptor WorkGroupIDZ = + ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); + if (ST.hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) { + switch (ArgType) { + case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: + Arg = &WorkGroupIDX; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: + Arg = &WorkGroupIDY; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: + Arg = &WorkGroupIDZ; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; + default: + break; + } + } + + if (!Arg) + std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); if (!Arg) { if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { @@ -6848,6 +6883,21 @@ bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI, return true; } +bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI, + MachineIRBuilder &B) const { + // With architected SGPRs, waveIDinGroup is in TTMP8[29:25]. 
+ if (!ST.hasArchitectedSGPRs()) + return false; + LLT S32 = LLT::scalar(32); + Register DstReg = MI.getOperand(0).getReg(); + auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8)); + auto LSB = B.buildConstant(S32, 25); + auto Width = B.buildConstant(S32, 5); + B.buildUbfx(DstReg, TTMP8, LSB, Width); + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const { MachineIRBuilder &B = Helper.MIRBuilder; @@ -6970,6 +7020,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_workgroup_id_z: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); + case Intrinsic::amdgcn_wave_id: + return legalizeWaveID(MI, B); case Intrinsic::amdgcn_lds_kernel_id: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::LDS_KERNEL_ID); @@ -7134,6 +7186,29 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); case Intrinsic::amdgcn_image_bvh_intersect_ray: return legalizeBVHIntrinsic(MI, B); + case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: { + Register Index = MI.getOperand(5).getReg(); + LLT S32 = LLT::scalar(32); + if (MRI.getType(Index) != S32) + MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0)); + return true; + } + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: + case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { + Register Index = MI.getOperand(7).getReg(); + LLT S32 = LLT::scalar(32); + if (MRI.getType(Index) != S32) + 
MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0)); + return true; + } case Intrinsic::amdgcn_fmed3: { GISelChangeObserver &Observer = Helper.Observer; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 56aabd4f6ab71b..ecbe42681c6690 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -212,6 +212,7 @@ class AMDGPULegalizerInfo final : public LegalizerInfo { bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const; + bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeImageIntrinsic( MachineInstr &MI, MachineIRBuilder &B, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 5e73411cae9b70..c1b244f50d93f8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -521,10 +521,18 @@ static Value *promoteAllocaUserToVector( // For memset, we don't need to know the previous value because we // currently only allow memsets that cover the whole alloca. Value *Elt = MSI->getOperand(1); - if (DL.getTypeStoreSize(VecEltTy) > 1) { - Value *EltBytes = - Builder.CreateVectorSplat(DL.getTypeStoreSize(VecEltTy), Elt); - Elt = Builder.CreateBitCast(EltBytes, VecEltTy); + const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy); + if (BytesPerElt > 1) { + Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt); + + // If the element type of the vector is a pointer, we need to first cast + // to an integer, then use a PtrCast. 
+ if (VecEltTy->isPointerTy()) { + Type *PtrInt = Builder.getIntNTy(BytesPerElt * 8); + Elt = Builder.CreateBitCast(EltBytes, PtrInt); + Elt = Builder.CreateIntToPtr(Elt, VecEltTy); + } else + Elt = Builder.CreateBitCast(EltBytes, VecEltTy); } return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index bdd4e891f15899..09fac963d222d4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4505,6 +4505,22 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_wmma_f32_16x16x16_f16: case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4: case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8: + case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: + case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_log: case Intrinsic::amdgcn_exp2: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 67263f23b98314..bb1c6b73372999 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td 
@@ -414,6 +414,22 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; // The dummy boolean output is divergent from the IR's perspective, diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 489cf85693edb2..9ab657f4e7bb4f 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -151,6 +151,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { ImmTyOpSelHi, ImmTyNegLo, ImmTyNegHi, + ImmTyIndexKey8bit, + ImmTyIndexKey16bit, ImmTyDPP8, ImmTyDppCtrl, ImmTyDppRowMask, @@ -383,6 +385,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isGDS() const { return isImmTy(ImmTyGDS); } bool isLDS() const { return isImmTy(ImmTyLDS); } bool isCPol() const { return isImmTy(ImmTyCPol); } + bool isIndexKey8bit() const { return isImmTy(ImmTyIndexKey8bit); } + bool isIndexKey16bit() const { return isImmTy(ImmTyIndexKey16bit); } bool isTFE() const { return isImmTy(ImmTyTFE); } bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<7>(getImm()); } bool isDppBankMask() const { return isImmTy(ImmTyDppBankMask); } @@ -656,6 +660,14 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isVISrcF16() || isVISrcB32(); } + bool isVISrc_64F16() const { + return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::f16); + } + + bool isVISrc_64B32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32); + } + 
bool isVISrc_64B64() const { return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i64); } @@ -672,6 +684,14 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32); } + bool isVISrc_256B32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i32); + } + + bool isVISrc_256F32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f32); + } + bool isVISrc_256B64() const { return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i64); } @@ -1047,6 +1067,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { case ImmTyOffset1: OS << "Offset1"; break; case ImmTySMEMOffsetMod: OS << "SMEMOffsetMod"; break; case ImmTyCPol: OS << "CPol"; break; + case ImmTyIndexKey8bit: OS << "index_key"; break; + case ImmTyIndexKey16bit: OS << "index_key"; break; case ImmTyTFE: OS << "TFE"; break; case ImmTyD16: OS << "D16"; break; case ImmTyFORMAT: OS << "FORMAT"; break; @@ -1604,6 +1626,11 @@ class AMDGPUAsmParser : public MCTargetAsmParser { ParseStatus parseRegWithFPInputMods(OperandVector &Operands); ParseStatus parseRegWithIntInputMods(OperandVector &Operands); ParseStatus parseVReg32OrOff(OperandVector &Operands); + ParseStatus tryParseIndexKey(OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy); + ParseStatus parseIndexKey8bit(OperandVector &Operands); + ParseStatus parseIndexKey16bit(OperandVector &Operands); + ParseStatus parseDfmtNfmt(int64_t &Format); ParseStatus parseUfmt(int64_t &Format); ParseStatus parseSymbolicSplitFormat(StringRef FormatStr, SMLoc Loc, @@ -1784,6 +1811,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser { void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands); void cvtVOP3(MCInst &Inst, const OperandVector &Operands); void cvtVOP3P(MCInst &Inst, const OperandVector &Operands); + void cvtSWMMAC(MCInst &Inst, const OperandVector &Operands); + void cvtVOPD(MCInst &Inst, const OperandVector &Operands); void cvtVOP3OpSel(MCInst &Inst, const 
OperandVector &Operands, OptionalImmIndexMap &OptionalIdx); @@ -3500,6 +3529,9 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) { return !isInlineConstant(Inst, OpIdx); } else if (MO.isReg()) { auto Reg = MO.getReg(); + if (!Reg) { + return false; + } const MCRegisterInfo *TRI = getContext().getRegisterInfo(); auto PReg = mc2PseudoReg(Reg); return isSGPR(PReg, TRI) && PReg != SGPR_NULL; @@ -4364,7 +4396,11 @@ bool AMDGPUAsmParser::validateNeg(const MCInst &Inst, int OpName) { uint64_t TSFlags = MII.get(Opc).TSFlags; // v_dot4 fp8/bf8 neg_lo/neg_hi not allowed on src0 and src1 (allowed on src2) - if (!(TSFlags & SIInstrFlags::IsDOT)) + // v_wmma iu4/iu8 neg_lo not allowed on src2 (allowed on src0, src1) + // v_swmmac f16/bf16 neg_lo/neg_hi not allowed on src2 (allowed on src0, src1) + // other wmma/swmmac instructions don't have neg_lo/neg_hi operand. + if (!(TSFlags & SIInstrFlags::IsDOT) && !(TSFlags & SIInstrFlags::IsWMMA) && + !(TSFlags & SIInstrFlags::IsSWMMAC)) return true; int NegIdx = AMDGPU::getNamedOperandIdx(Opc, OpName); @@ -6465,6 +6501,33 @@ bool AMDGPUAsmParser::tryParseFmt(const char *Pref, return true; } +ParseStatus AMDGPUAsmParser::tryParseIndexKey(OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy) { + const char *Pref = "index_key"; + int64_t ImmVal = 0; + SMLoc Loc = getLoc(); + auto Res = parseIntWithPrefix(Pref, ImmVal); + if (!Res.isSuccess()) + return Res; + + if (ImmTy == AMDGPUOperand::ImmTyIndexKey16bit && (ImmVal < 0 || ImmVal > 1)) + return Error(Loc, Twine("out of range ", StringRef(Pref))); + + if (ImmTy == AMDGPUOperand::ImmTyIndexKey8bit && (ImmVal < 0 || ImmVal > 3)) + return Error(Loc, Twine("out of range ", StringRef(Pref))); + + Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, ImmTy)); + return ParseStatus::Success; +} + +ParseStatus AMDGPUAsmParser::parseIndexKey8bit(OperandVector &Operands) { + return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey8bit); +} + +ParseStatus 
AMDGPUAsmParser::parseIndexKey16bit(OperandVector &Operands) { + return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey16bit); +} + // dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their // values to live in a joint format operand in the MCInst encoding. ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) { @@ -8303,12 +8366,20 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0; if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi || - Opc == AMDGPU::V_CVT_SR_FP8_F32_vi) { + Opc == AMDGPU::V_CVT_SR_FP8_F32_vi || + Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_gfx12) { Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods Inst.addOperand(Inst.getOperand(0)); } - if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in)) { + // Adding vdst_in operand is already covered for these DPP instructions in + // cvtVOP3DPP. + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in) && + !(Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12)) { assert(!IsPacked); Inst.addOperand(Inst.getOperand(0)); } @@ -8329,10 +8400,12 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, } int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo); - if (NegLoIdx != -1) { + if (NegLoIdx != -1) addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo); + + int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi); + if (NegHiIdx != -1) addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi); - } const int Ops[] = { AMDGPU::OpName::src0, AMDGPU::OpName::src1, @@ -8352,11 +8425,11 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, if (OpSelHiIdx != -1) OpSelHi = 
Inst.getOperand(OpSelHiIdx).getImm(); - if (NegLoIdx != -1) { - int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi); + if (NegLoIdx != -1) NegLo = Inst.getOperand(NegLoIdx).getImm(); + + if (NegHiIdx != -1) NegHi = Inst.getOperand(NegHiIdx).getImm(); - } for (int J = 0; J < 3; ++J) { int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]); @@ -8392,6 +8465,43 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) { cvtVOP3P(Inst, Operands, OptIdx); } +static void addSrcModifiersAndSrc(MCInst &Inst, const OperandVector &Operands, + unsigned i, unsigned Opc, unsigned OpName) { + if (AMDGPU::getNamedOperandIdx(Opc, OpName) != -1) + ((AMDGPUOperand &)*Operands[i]).addRegOrImmWithFPInputModsOperands(Inst, 2); + else + ((AMDGPUOperand &)*Operands[i]).addRegOperands(Inst, 1); +} + +void AMDGPUAsmParser::cvtSWMMAC(MCInst &Inst, const OperandVector &Operands) { + unsigned Opc = Inst.getOpcode(); + + ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1); + addSrcModifiersAndSrc(Inst, Operands, 2, Opc, AMDGPU::OpName::src0_modifiers); + addSrcModifiersAndSrc(Inst, Operands, 3, Opc, AMDGPU::OpName::src1_modifiers); + ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1); // srcTiedDef + ((AMDGPUOperand &)*Operands[4]).addRegOperands(Inst, 1); // src2 + + OptionalImmIndexMap OptIdx; + for (unsigned i = 5; i < Operands.size(); ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + OptIdx[Op.getImmTy()] = i; + } + + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::index_key_8bit)) + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyIndexKey8bit); + + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::index_key_16bit)) + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyIndexKey16bit); + + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp)) + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyClampSI); + + cvtVOP3P(Inst, Operands, OptIdx); +} + 
//===----------------------------------------------------------------------===// // VOPD //===----------------------------------------------------------------------===// @@ -8770,6 +8880,22 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, } } + int VdstInIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); + if (VdstInIdx == static_cast(Inst.getNumOperands())) { + Inst.addOperand(Inst.getOperand(0)); + } + + bool IsVOP3CvtSrDpp = Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12; + if (IsVOP3CvtSrDpp) { + if (Src2ModIdx == static_cast(Inst.getNumOperands())) { + Inst.addOperand(MCOperand::createImm(0)); + Inst.addOperand(MCOperand::createReg(0)); + } + } + auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(), MCOI::TIED_TO); if (TiedTo != -1) { diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 86096b0d80b424..a9968cfe25b46b 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -260,8 +260,12 @@ DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 64) DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 64) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 16) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 16) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32) @@ -704,6 +708,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst 
&MI, uint64_t &Size, break; Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address, CS); + if (Res) + break; + + Res = tryDecodeInst(DecoderTableWMMAGFX1264, MI, QW, Address, CS); } while (false); if (Res && AMDGPU::isMAC(MI.getOpcode())) { @@ -712,6 +720,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, AMDGPU::OpName::src2_modifiers); } + if (Res && (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp || + MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp)) { + // Insert dummy unused src2_modifiers. + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src2_modifiers); + } + if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) && !AMDGPU::hasGDS(STI)) { insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds); @@ -942,6 +957,7 @@ void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const { // first add optional MI operands to check FI DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { unsigned Opc = MI.getOpcode(); + if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) { convertVOP3PDPPInst(MI); } else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) || @@ -951,6 +967,15 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { if (isMacDPP(MI)) convertMacDPPInst(MI); + int VDstInIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); + if (VDstInIdx != -1) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in); + + if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 || + MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2); + unsigned DescNumOps = MCII->get(Opc).getNumOperands(); if (MI.getNumOperands() < DescNumOps && AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) { @@ -977,6 +1002,15 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { if (isMacDPP(MI)) convertMacDPPInst(MI); + int VDstInIdx = + 
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); + if (VDstInIdx != -1) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in); + + if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 || + MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2); + unsigned Opc = MI.getOpcode(); unsigned DescNumOps = MCII->get(Opc).getNumOperands(); if (MI.getNumOperands() < DescNumOps && diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index b6e4e65ff5b03b..08bef7ad3002c6 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1716,14 +1716,14 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { } bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { - if (!SIInstrInfo::isWMMA(*MI)) + if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI)) return false; const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); - auto IsHazardFn = [MI, TII, TRI](const MachineInstr &I) { - if (!SIInstrInfo::isWMMA(I)) + auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) { + if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I)) return false; // Src0 or Src1 of the current wmma instruction overlaps with the dest of @@ -1753,6 +1753,7 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { const MachineOperand *Src2Mods = TII->getNamedOperand(*MI, AMDGPU::OpName::src2_modifiers); const bool NoSrc2Mods = + !Src2Mods || (Src2Mods->getImm() & (SISrcMods::NEG | SISrcMods::NEG_HI)) == 0; // Exception: there is no hazard if the wmma instructions are of the same // type and there is no input modifier on src2 of the current instruction. 
@@ -1760,6 +1761,18 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { TII->pseudoToMCOpcode(MI->getOpcode()))); } + // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall) + // but Index can't overlap with PrevDstReg. + if (AMDGPU::isGFX12Plus(ST)) { + if (SIInstrInfo::isSWMMAC(*MI)) { + const Register CurIndex = + TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg(); + if (TRI->regsOverlap(PrevDstReg, CurIndex)) + return true; + } + return false; + } + return false; }; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index e73e53aa270f91..abfa4a3531e8e1 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1275,6 +1275,23 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI, (ModIdx != -1) ? MI->getOperand(ModIdx).getImm() : DefaultValue; } + // Print three values of neg/opsel for wmma instructions (prints 0 when there + // is no src_modifier operand instead of not printing anything). 
+ if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsSWMMAC || + MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsWMMA) { + NumOps = 0; + int DefaultValue = Mod == SISrcMods::OP_SEL_1; + for (int OpName : + {AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers}) { + int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName); + if (Idx != -1) + Ops[NumOps++] = MI->getOperand(Idx).getImm(); + else + Ops[NumOps++] = DefaultValue; + } + } + const bool HasDstSel = NumOps > 0 && Mod == SISrcMods::OP_SEL_0 && @@ -1305,6 +1322,16 @@ void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned Opc = MI->getOpcode(); + if (isCvt_F32_Fp8_Bf8_e64(Opc)) { + auto SrcMod = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); + unsigned Mod = MI->getOperand(SrcMod).getImm(); + unsigned Index0 = !!(Mod & SISrcMods::OP_SEL_0); + unsigned Index1 = !!(Mod & SISrcMods::OP_SEL_1); + if (Index0 || Index1) + O << " op_sel:[" << Index0 << ',' << Index1 << ']'; + return; + } if (isPermlane16(Opc)) { auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); auto BCN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers); @@ -1336,6 +1363,26 @@ void AMDGPUInstPrinter::printNegHi(const MCInst *MI, unsigned OpNo, printPackedModifier(MI, " neg_hi:[", SISrcMods::NEG_HI, O); } +void AMDGPUInstPrinter::printIndexKey8bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + auto Imm = MI->getOperand(OpNo).getImm() & 0x7; + if (Imm == 0) + return; + + O << " index_key:" << Imm; +} + +void AMDGPUInstPrinter::printIndexKey16bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + auto Imm = MI->getOperand(OpNo).getImm() & 0x7; + if (Imm == 0) + return; + + O << " index_key:" << Imm; +} + void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream 
&O) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index e3958f88277da8..e91ff86b219a0c 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -139,6 +139,10 @@ class AMDGPUInstPrinter : public MCInstPrinter { const MCSubtargetInfo &STI, raw_ostream &O); void printNegHi(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printIndexKey8bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printIndexKey16bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printInterpSlot(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printInterpAttr(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 8ab66d4fd5b861..19596d53b45328 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -167,6 +167,9 @@ enum : uint64_t { // ds_gws_* instructions. GWS = UINT64_C(1) << 62, + + // Is a SWMMAC instruction. + IsSWMMAC = UINT64_C(1) << 63, }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. 
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 2862a7787e75a3..a812cdc61500cc 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -208,6 +208,7 @@ bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const { assert(Old.isReg() && Fold.isImm()); if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) || + (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) || (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT))) return false; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index cf947dccafac55..d6bf0d8cb2efa8 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2072,11 +2072,45 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT, AMDGPUFunctionArgInfo::PreloadedValue PVID) const { - const ArgDescriptor *Reg; + const ArgDescriptor *Reg = nullptr; const TargetRegisterClass *RC; LLT Ty; - std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID); + CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv(); + const ArgDescriptor WorkGroupIDX = + ArgDescriptor::createRegister(AMDGPU::TTMP9); + // If GridZ is not programmed in an entry function then the hardware will set + // it to all zeros, so there is no need to mask the GridY value in the low + // order bits. + const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister( + AMDGPU::TTMP7, + AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? 
~0u : 0xFFFFu); + const ArgDescriptor WorkGroupIDZ = + ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); + if (Subtarget->hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) { + switch (PVID) { + case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: + Reg = &WorkGroupIDX; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: + Reg = &WorkGroupIDY; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: + Reg = &WorkGroupIDZ; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; + default: + break; + } + } + + if (!Reg) + std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID); if (!Reg) { if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) { // It's possible for a kernarg intrinsic call to appear in a kernel with @@ -2505,28 +2539,24 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, } } - if (Info.hasWorkGroupIDX()) { - Register Reg = Info.addWorkGroupIDX(HasArchitectedSGPRs); - if (!HasArchitectedSGPRs) + if (!HasArchitectedSGPRs) { + if (Info.hasWorkGroupIDX()) { + Register Reg = Info.addWorkGroupIDX(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } - CCInfo.AllocateReg(Reg); - } - - if (Info.hasWorkGroupIDY()) { - Register Reg = Info.addWorkGroupIDY(HasArchitectedSGPRs); - if (!HasArchitectedSGPRs) + if (Info.hasWorkGroupIDY()) { + Register Reg = Info.addWorkGroupIDY(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } - CCInfo.AllocateReg(Reg); - } - - if (Info.hasWorkGroupIDZ()) { - Register Reg = Info.addWorkGroupIDZ(HasArchitectedSGPRs); - if (!HasArchitectedSGPRs) + if (Info.hasWorkGroupIDZ()) { + Register Reg = Info.addWorkGroupIDZ(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); - - CCInfo.AllocateReg(Reg); + CCInfo.AllocateReg(Reg); + } } if (Info.hasWorkGroupInfo()) { @@ -7890,6 +7920,17 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, 
SDValue Rsrc, return Loads[0]; } +SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const { + // With architected SGPRs, waveIDinGroup is in TTMP8[29:25]. + if (!Subtarget->hasArchitectedSGPRs()) + return {}; + SDLoc SL(Op); + MVT VT = MVT::i32; + SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT); + return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8, + DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT)); +} + SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op, unsigned Dim, const ArgDescriptor &Arg) const { @@ -8060,6 +8101,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_workgroup_id_z: return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); + case Intrinsic::amdgcn_wave_id: + return lowerWaveID(DAG, Op); case Intrinsic::amdgcn_lds_kernel_id: { if (MFI->isEntryFunction()) return getLDSKernelId(DAG, DL); @@ -8242,6 +8285,36 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SIInstrInfo::MO_ABS32_LO); return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; } + case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: { + if (Op.getOperand(4).getValueType() == MVT::i32) + return SDValue(); + + SDLoc SL(Op); + auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), + Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), IndexKeyi32); + } + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: + case 
Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { + if (Op.getOperand(6).getValueType() == MVT::i32) + return SDValue(); + + SDLoc SL(Op); + auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), + {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), Op.getOperand(4), Op.getOperand(5), + IndexKeyi32, Op.getOperand(7)}); + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index d66ba0b59ba906..e436c23af5bcac 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -80,6 +80,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, unsigned NewOpcode) const; + SDValue lowerWaveID(SelectionDAG &DAG, SDValue Op) const; SDValue lowerWorkitemID(SelectionDAG &DAG, SDValue Op, unsigned Dim, const ArgDescriptor &ArgDesc) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 1b66d163714fbc..ab536f8f49d537 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -161,6 +161,9 @@ class InstSI ; def neg_lo0 : ArrayOperand0<"neg_lo", "NegLo">; def neg_hi0 : ArrayOperand0<"neg_hi", "NegHi">; +def IndexKey16bit : CustomOperand; +def IndexKey8bit : CustomOperand; + def dpp8 : CustomOperand; def dpp_ctrl : CustomOperand; @@ -1344,6 +1347,13 @@ def VOP3PModsDOT : ComplexPattern; def VOP3PModsNeg : ComplexPattern; def WMMAOpSelVOP3PMods : ComplexPattern; +def WMMAModsF32NegAbs : ComplexPattern; +def WMMAModsF16Neg : ComplexPattern; +def WMMAModsF16NegAbs : ComplexPattern; +def WMMAVISrc : ComplexPattern; +def SWMMACIndex8 : ComplexPattern; +def SWMMACIndex16 : ComplexPattern; + def VOP3OpSel : 
ComplexPattern; def VOP3OpSelMods : ComplexPattern; @@ -1684,8 +1694,9 @@ class getIns64 _ArgVT, bit _EnableClamp = 0> { field bit IsDOT = 0; field bit IsSingle = 0; field bit IsWMMA = 0; + field bit IsSWMMAC = 0; + + field bit IsFP8 = 0; field bit HasDst = !ne(DstVT.Value, untyped.Value); field bit HasDst32 = HasDst; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 9ff66a094f991f..0336ec4985ea74 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -751,35 +751,21 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, } // Add system SGPRs. - Register addWorkGroupIDX(bool HasArchitectedSGPRs) { - Register Reg = - HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP9 : getNextSystemSGPR(); - ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(Reg); - if (!HasArchitectedSGPRs) - NumSystemSGPRs += 1; - + Register addWorkGroupIDX() { + ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR()); + NumSystemSGPRs += 1; return ArgInfo.WorkGroupIDX.getRegister(); } - Register addWorkGroupIDY(bool HasArchitectedSGPRs) { - Register Reg = - HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR(); - unsigned Mask = HasArchitectedSGPRs && hasWorkGroupIDZ() ? 0xffff : ~0u; - ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(Reg, Mask); - if (!HasArchitectedSGPRs) - NumSystemSGPRs += 1; - + Register addWorkGroupIDY() { + ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR()); + NumSystemSGPRs += 1; return ArgInfo.WorkGroupIDY.getRegister(); } - Register addWorkGroupIDZ(bool HasArchitectedSGPRs) { - Register Reg = - HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR(); - unsigned Mask = HasArchitectedSGPRs ? 
0xffff << 16 : ~0u; - ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(Reg, Mask); - if (!HasArchitectedSGPRs) - NumSystemSGPRs += 1; - + Register addWorkGroupIDZ() { + ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR()); + NumSystemSGPRs += 1; return ArgInfo.WorkGroupIDZ.getRegister(); } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 84b9330ef9633e..50d8bfa8750818 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -2358,6 +2358,11 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( bool Changed = false; + if (IsNonTemporal) { + // Set non-temporal hint for all cache levels. + Changed |= setTH(MI, AMDGPU::CPol::TH_NT); + } + if (IsVolatile) { Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); @@ -2370,11 +2375,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( Position::AFTER); } - if (IsNonTemporal) { - // Set non-temporal hint for all cache levels. 
- Changed |= setTH(MI, AMDGPU::CPol::TH_NT); - } - return Changed; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index f42af89cf5e6d3..b3265b73fa7e11 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1341,9 +1341,14 @@ def VCSrc_v2f16 : RegOrV2F16 <"VS_32", "OPERAND_REG_INLINE_C">; // VISrc_* Operands with a VGPR or an inline constant //===----------------------------------------------------------------------===// +def VISrc_64_f16 : RegOrF16 <"VReg_64", "OPERAND_REG_INLINE_C">; +def VISrc_64_b32 : RegOrB32 <"VReg_64", "OPERAND_REG_INLINE_C">; def VISrc_64_f64 : RegOrF64 <"VReg_64", "OPERAND_REG_INLINE_C">; +def VISrc_128_f16 : RegOrF16 <"VReg_128", "OPERAND_REG_INLINE_C">; def VISrc_128_b32 : RegOrB32 <"VReg_128", "OPERAND_REG_INLINE_C">; def VISrc_128_f32 : RegOrF32 <"VReg_128", "OPERAND_REG_INLINE_C">; +def VISrc_256_b32 : RegOrB32 <"VReg_256", "OPERAND_REG_INLINE_C">; +def VISrc_256_f32 : RegOrF32 <"VReg_256", "OPERAND_REG_INLINE_C">; def VISrc_256_f64 : RegOrF64 <"VReg_256", "OPERAND_REG_INLINE_C">; def VISrc_512_b32 : RegOrB32 <"VReg_512", "OPERAND_REG_INLINE_C">; def VISrc_512_f32 : RegOrF32 <"VReg_512", "OPERAND_REG_INLINE_C">; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 0bf9452d822e97..106fdb19f27895 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -529,6 +529,17 @@ bool isPermlane16(unsigned Opc) { Opc == AMDGPU::V_PERMLANEX16_VAR_B32_e64_gfx12; } +bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) { + return Opc == AMDGPU::V_CVT_F32_BF8_e64_gfx12 || + Opc == AMDGPU::V_CVT_F32_FP8_e64_gfx12 || + Opc == AMDGPU::V_CVT_F32_BF8_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_F32_BF8_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp8_gfx12 || + Opc == 
AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 || + Opc == AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12; +} + bool isGenericAtomic(unsigned Opc) { return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN || Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX || diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index d3f55c79201747..11b0bc5c81711e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -535,6 +535,9 @@ bool isPermlane16(unsigned Opc); LLVM_READNONE bool isGenericAtomic(unsigned Opc); +LLVM_READNONE +bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc); + namespace VOPD { enum Component : unsigned { diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 95a1d86963473a..ef652fce65482c 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -571,6 +571,7 @@ let SubtargetPredicate = isGFX9Only in { } // End SubtargetPredicate = isGFX9Only class VOPProfile_Base_CVT_F32_F8 : VOPProfileI2F { + let HasExtDPP = 1; let HasExtSDWA = 1; let HasExtSDWA9 = 1; let HasExt = 1; @@ -599,6 +600,7 @@ class Cvt_F32_F8_Pat; +let SubtargetPredicate = isGFX9Only in { let OtherPredicates = [HasCvtFP8VOP1Bug] in { def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)), (V_CVT_F32_FP8_sdwa 0, $src, 0, 0, 0)>; @@ -617,6 +619,7 @@ foreach Index = [1, 2, 3] in { def : Cvt_F32_F8_Pat; def : Cvt_F32_F8_Pat; } +} // End SubtargetPredicate = isGFX9Only class Cvt_PK_F32_F8_Pat : GCNPat< @@ -626,11 +629,77 @@ class Cvt_PK_F32_F8_Pat; -foreach Index = [0, -1] in { - def : Cvt_PK_F32_F8_Pat; - def : Cvt_PK_F32_F8_Pat; +let SubtargetPredicate = isGFX9Only in { + foreach Index = [0, -1] in { + def : Cvt_PK_F32_F8_Pat; + def : Cvt_PK_F32_F8_Pat; + } +} + + +// Similar to VOPProfile_Base_CVT_F32_F8, but for VOP3 instructions. 
+def VOPProfile_Base_CVT_PK_F32_F8_OpSel : VOPProfileI2F { + let HasOpSel = 1; + let HasExtVOP3DPP = 0; +} + +def VOPProfile_Base_CVT_F32_F8_OpSel : VOPProfile<[f32, i32, untyped, untyped]> { + let HasOpSel = 1; + let HasExtDPP = 1; + let HasExtVOP3DPP = 1; + let IsFP8 = 1; + let HasClamp = 0; + let HasOMod = 0; + let HasModifiers = 1; + let Src1VOP3DPP = Src1RC64; +} + +let SubtargetPredicate = isGFX12Plus, mayRaiseFPException = 0, + SchedRW = [WriteFloatCvt] in { + defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>; + defm V_CVT_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>; + defm V_CVT_PK_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_fp8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>; + defm V_CVT_PK_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_bf8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>; +} + +class Cvt_F32_F8_Pat_OpSel index, + VOP1_Pseudo inst_e32, VOP3_Pseudo inst_e64> : GCNPat< + (f32 (node i32:$src, index)), + !if (index, + (inst_e64 !if(index{0}, + !if(index{1}, !or(SRCMODS.OP_SEL_0, SRCMODS.OP_SEL_1), + SRCMODS.OP_SEL_0), + !if(index{1}, SRCMODS.OP_SEL_1, 0)), + $src, 0), + (inst_e32 $src)) +>; + +let SubtargetPredicate = isGFX12Plus in { + foreach Index = [0, 1, 2, 3] in { + def : Cvt_F32_F8_Pat_OpSel; + def : Cvt_F32_F8_Pat_OpSel; + } +} + +class Cvt_PK_F32_F8_Pat_OpSel : GCNPat< + (v2f32 (node i32:$src, index)), + !if (index, + (inst_e64 SRCMODS.OP_SEL_0, $src, 0, 0, SRCMODS.NONE), + (inst_e32 $src)) +>; + +let SubtargetPredicate = isGFX12Plus in { + foreach Index = [0, -1] in { + def : Cvt_PK_F32_F8_Pat_OpSel; + def : Cvt_PK_F32_F8_Pat_OpSel; + } } let SubtargetPredicate = isGFX10Plus in { @@ -853,6 +922,20 @@ multiclass VOP1_Real_NO_DPP_OP_SEL_with_name op, VOP3_Real_with_name; +// Define VOP1 instructions using the pseudo instruction with its old profile and +// VOP3 using the OpSel profile for the pseudo instruction. 
+defm V_CVT_F32_FP8 : VOP1_Real_NO_VOP3_with_name_gfx12<0x06c, "V_CVT_F32_FP8", "v_cvt_f32_fp8">; +defm V_CVT_F32_FP8 : VOP1_Realtriple_e64_with_name; + +defm V_CVT_F32_BF8 : VOP1_Real_NO_VOP3_with_name_gfx12<0x06d, "V_CVT_F32_BF8", "v_cvt_f32_bf8">; +defm V_CVT_F32_BF8 : VOP1_Realtriple_e64_with_name; + +defm V_CVT_PK_F32_FP8 : VOP1_Real_e32_with_name; +defm V_CVT_PK_F32_FP8 : VOP3_Real_with_name; + +defm V_CVT_PK_F32_BF8 : VOP1_Real_e32_with_name; +defm V_CVT_PK_F32_BF8 : VOP3_Real_with_name; + defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00c, "V_CVT_RPI_I32_F32", "v_cvt_nearest_i32_f32">; defm V_CVT_FLOOR_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00d, diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 713b4712d563c0..14db5221021489 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -520,8 +520,26 @@ def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile { let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0, FP32InputMods:$src1_modifiers, Src1RC64:$src1, VGPR_32:$vdst_in, op_sel0:$op_sel); + let InsVOP3DPP = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + VGPR_32:$vdst_in, op_sel0:$op_sel, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + + let InsVOP3DPP16 = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + VGPR_32:$vdst_in, op_sel0:$op_sel, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl, FI:$fi); + let InsVOP3DPP8 = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + VGPR_32:$vdst_in, op_sel0:$op_sel, dpp8:$dpp8, FI:$fi); + let HasClamp = 0; - let HasExtVOP3DPP = 0; + let HasExtVOP3DPP = 1; } def 
VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile, @@ -530,14 +548,36 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile, FP32InputMods:$src1_modifiers, Src1RC64:$src1, FP32InputMods:$src2_modifiers, VGPR_32:$src2, op_sel0:$op_sel); + let InsVOP3DPP16 = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + FP32InputMods:$src2_modifiers, VGPR_32:$src2, + op_sel0:$op_sel, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl, FI:$fi); + let InsVOP3DPP8 = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + FP32InputMods:$src2_modifiers, VGPR_32:$src2, + op_sel0:$op_sel, dpp8:$dpp8, FI:$fi); let HasClamp = 0; let HasSrc2 = 0; let HasSrc2Mods = 1; + let HasExtVOP3DPP = 1; + let HasOpSel = 1; let AsmVOP3OpSel = !subst(", $src2_modifiers", "", getAsmVOP3OpSel<3, HasClamp, HasOMod, HasSrc0FloatMods, HasSrc1FloatMods, HasSrc2FloatMods>.ret); - let HasExtVOP3DPP = 0; + let AsmVOP3DPP16 = !subst(", $src2_modifiers", "", + getAsmVOP3DPP16.ret>.ret); + let AsmVOP3DPP8 = !subst(", $src2_modifiers", "", + getAsmVOP3DPP8.ret>.ret); } def IsPow2Plus1: PatLeaf<(i32 imm), [{ @@ -618,13 +658,13 @@ let SubtargetPredicate = HasFP8ConversionInsts, mayRaiseFPException = 0, class Cvt_PK_F8_F32_Pat : GCNPat< (i32 (node f32:$src0, f32:$src1, i32:$old, index)), - (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, !if(index, SRCMODS.OP_SEL_0, 0)) + (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, 0) >; class Cvt_SR_F8_F32_Pat index, VOP3_Pseudo inst> : GCNPat< (i32 (node f32:$src0, i32:$src1, i32:$old, index)), (inst !if(index{1}, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, - !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, !if(index{1}, SRCMODS.OP_SEL_0, 0)) + !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, 0) >; foreach Index = [0, -1] in { @@ -998,6 +1038,11 @@ defm V_MAXIMUM_F16 : 
VOP3Only_Realtriple_t16_gfx12<0x368>; defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>; defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>; +defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_gfx12<0x369>; +defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36a>; +defm V_CVT_SR_FP8_F32 : VOP3Only_Realtriple_gfx12<0x36b>; +defm V_CVT_SR_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36c>; + //===----------------------------------------------------------------------===// // GFX11, GFX12 //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 0c7a08cd4bc91f..107b95a9ca8eb0 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -936,16 +936,19 @@ multiclass WMMAInst(NAME # _threeaddr # Suffix)>; } - if !eq(Type, WMMAOpSel) then { - def : WMMAOpSelPat(NAME # _twoaddr # Suffix), node, P>; - } else if !eq(Type, WMMAUIClamp) then { - def : WMMAUIClampPat(NAME # _twoaddr # Suffix), node, P>; - } else { - def : WMMARegularPat(NAME # _twoaddr # Suffix), node, P>; + let SubtargetPredicate = isGFX11Only in { + if !eq(Type, WMMAOpSel) then { + def : WMMAOpSelPat(NAME # _twoaddr # Suffix), node, P>; + } else if !eq(Type, WMMAUIClamp) then { + def : WMMAUIClampPat(NAME # _twoaddr # Suffix), node, P>; + } else { + def : WMMARegularPat(NAME # _twoaddr # Suffix), node, P>; + } } } + let WaveSizePredicate = isWave32 in { defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 1>; defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>; @@ -969,6 +972,398 @@ let WaveSizePredicate = isWave64 in { } +class VOP3PWMMA_Profile ArgTy, bit _IsSWMMAC, int _IndexType, + bit _IsIU, 
bit _IsFP8BF8> + : VOP3P_Profile> { + bit IsIU = _IsIU; + bit IsFP8BF8 = _IsFP8BF8; + bit IsF16BF16 = !not(!or(IsIU, IsFP8BF8)); + + int IndexType = _IndexType; + + let IsPacked = 1; + let IsWMMA = !not(_IsSWMMAC); + let IsSWMMAC = _IsSWMMAC; + + bit IsAB_F16 = !and(IsF16BF16, ArgTy[1].isFP); + bit IsAB_BF16 = !and(IsF16BF16, isIntType.ret); + bit IsC_F32 = !or(!eq(ArgTy[3], v8f32), !eq(ArgTy[3], v4f32)); + bit IsC_BF16 = !or(!eq(ArgTy[3], v8i16), !eq(ArgTy[3], v4i16)); + bit IsC_F16 = !or(!eq(ArgTy[3], v8f16), !eq(ArgTy[3], v4f16)); + + bit NegLo01 = !or(IsF16BF16, IsIU); + bit NegLo2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA); + bit NegHi01 = IsF16BF16; + bit NegHi2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA); + bit NegLoAny = !or(NegLo01, NegLo2); + bit NegHiAny = !or(NegHi01, NegHi2); + + let DstRC = !cond(!eq(ArgTy[0], v8f32): VDst_256, + !eq(ArgTy[0], v8i32): VDst_256, + !eq(ArgTy[0], v8f16): VDst_128, + !eq(ArgTy[0], v8i16): VDst_128, + !eq(ArgTy[0], v4f32): VDst_128, + !eq(ArgTy[0], v4i32): VDst_128, + !eq(ArgTy[0], v4f16): VDst_64, + !eq(ArgTy[0], v4i16): VDst_64); + let Src0RC64 = !cond(!eq(ArgTy[1], v8f16): VRegSrc_128, + !eq(ArgTy[1], v4f16): VRegSrc_64, + !eq(ArgTy[1], v4i16): VRegSrc_64, + !eq(ArgTy[1], v8i16): VRegSrc_128, + !eq(ArgTy[1], v4i32): VRegSrc_128, + !eq(ArgTy[1], v2i32): VRegSrc_64, + !eq(ArgTy[1], i32) : VRegSrc_32); + let Src1RC64 = !cond(!eq(ArgTy[2], v16f16): VRegSrc_256, + !eq(ArgTy[2], v16i16): VRegSrc_256, + !eq(ArgTy[2], v8f16): VRegSrc_128, + !eq(ArgTy[2], v8i16): VRegSrc_128, + !eq(ArgTy[2], v4i32): VRegSrc_128, + !eq(ArgTy[1], v4i16): VRegSrc_64, + !eq(ArgTy[1], v4f16): VRegSrc_64, + !eq(ArgTy[2], v2i32): VRegSrc_64, + !eq(ArgTy[2], i32) : VRegSrc_32); + let Src2RC64 = !if(IsSWMMAC, DstRC, + !cond(!eq(ArgTy[3], v8f32): VISrc_256_f32, + !eq(ArgTy[3], v8i32): VISrc_256_b32, + !eq(ArgTy[3], v8f16): VISrc_128_f16, + !eq(ArgTy[3], v8i16): VISrc_128_f32, // bf16 + !eq(ArgTy[3], v4f16): VISrc_64_f16, + !eq(ArgTy[3], v4i16): VISrc_64_b32, + 
!eq(ArgTy[3], v4i32): VISrc_128_b32, + !eq(ArgTy[3], v4f32): VISrc_128_f32)); + + // For f16 and bf16 matrices A and B, each element can be modified by + // fneg(neg_lo,neg_hi = 1). For iu4 and iu8 matrices A and B neg_lo is + // overloaded to mean unsigned/signed: neg_lo = 0 (u4 and u8) unsigned(zext) + // neg_lo = 1 (i4 and i8) signed(sext). For f16, bf16 and f32 matrix C each + // element can be modified by fneg(neg_lo = 1) or fabs(neg_hi = 1). + + // Opcode | src0/src1 - matrix A/B | src2 - matrix C or Index + // --------------------------------------------------------------------------- + // wmma f32_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f32) + // wmma f32_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f32) + // --------------------------------------------------------------------------- + // wmma f16_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f16 or bf16) + // wmma bf16_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f16 or bf16) + // --------------------------------------------------------------------------- + // wmma i32_iu8/iu4 | neg_lo = 0 u4/u8(zext) | not allowed for + // | neg_lo = 1 i4/i8(sext) | i32 matrices + // --------------------------------------------------------------------------- + // wmma f32_fp8/bf8 | not allowed for | neg_lo = 1 neg C(f32) + // (4 instructions) | f8 and bf8 matrices | neg_hi = 1 abs C(f32) + // --------------------------------------------------------------------------- + // swmmac f32_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix + // swmmac f32_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst + // --------------------------------------------------------------------------- + // swmmac f16_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix + // swmmac bf16_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst + // --------------------------------------------------------------------------- + // swmmac i32_iu8/iu4 | neg_lo = 0 u4/u8(zext) | not allowed for sparse matrix 
+ // | neg_lo = 1 i4/i8(sext) | A Index - matrix C is in dst + // --------------------------------------------------------------------------- + // swmmac f32_fp8/bf8 | not allowed for | not allowed for sparse matrix + // (4 instructions) | f8 and bf8 matrices | A Index - matrix C is in dst + + // pseudo + + // fp8bf8 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16 + // use neg_lo and neg_hi. iu wmmas (C is i32) don't use src 2 modifiers, + // remaining wmmas(f16, bf16 and f8bf8) use neg_lo and neg_hi for C (C is f32 + // f16 or bf16). swmmac use index_key and don't use src 2 modifiers. + + dag Src0Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src0_modifiers)); + dag Src1Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src1_modifiers)); + dag Src2Mods = !if(IsIU, (ins), (ins PackedF16InputMods:$src2_modifiers)); + dag IndexKey = !cond(!eq(IndexType, 0) : (ins), + !eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit), + !eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit)); + dag Clamp = !if(IsIU, (ins clampmod0:$clamp), (ins)); + dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi), + !and(NegLoAny, !not(NegHiAny)) : (ins neg_lo0:$neg_lo), + !and(!not(NegLoAny), !not(NegHiAny)) : (ins)); + + let InsVOP3P = !con(Src0Mods, (ins Src0RC64:$src0), Src1Mods, (ins Src1RC64:$src1), + !cond(IsWMMA : !con(Src2Mods, (ins Src2RC64:$src2)), + IsSWMMAC : !con((ins DstRC:$srcTiedDef), (ins VRegSrc_32:$src2), IndexKey)), + Clamp, Neg); + + // asm + + string IndexKeyAsm = !cond(!eq(IndexType, 0) : "", + !eq(IndexType, 8) : "$index_key_8bit", + !eq(IndexType, 16) : "$index_key_16bit"); + string ClampAsm = !if(IsIU, "$clamp", ""); + string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi", + !and(NegLoAny, !not(NegHiAny)) : "$neg_lo", + !and(!not(NegLoAny), !not(NegHiAny)) : ""); + + let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#NegAsm#ClampAsm; + + // isel patterns + + dag Src0InPat = !cond(IsAB_F16 
: (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))), + IsAB_BF16 : (ins Src0VT:$src0), + IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), + IsFP8BF8 : (ins Src0VT:$src0)); + dag Src0OutPat = !cond(IsAB_F16 : (ins i32:$src0_modifiers, Src0VT:$src0), + IsAB_BF16 : (ins (i32 8), Src0VT:$src0), + IsIU : (ins i32:$src0_modifiers, Src0VT:$src0), + IsFP8BF8 : (ins Src0VT:$src0)); + dag Src1InPat = !cond(IsAB_F16 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))), + IsAB_BF16 : (ins Src1VT:$src1), + IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), + IsFP8BF8 : (ins Src1VT:$src1)); + dag Src1OutPat = !cond(IsAB_F16 : (ins i32:$src1_modifiers, Src1VT:$src1), + IsAB_BF16 : (ins (i32 8), Src1VT:$src1), + IsIU : (ins i32:$src1_modifiers, Src1VT:$src1), + IsFP8BF8 : (ins Src1VT:$src1)); + dag Src2InPatWmma = !cond(IsC_F32 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))), + IsC_F16 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))), + IsC_BF16 : (ins Src2VT:$src2), + IsIU : (ins Src2VT:$src2), + IsSWMMAC : (ins)); + dag Src2OutPatWmma = !cond(IsC_F32 : (ins i32:$src2_modifiers, Src2VT:$src2), + IsC_F16 : (ins i32:$src2_modifiers, Src2VT:$src2), + IsC_BF16 : (ins (i32 8), Src2VT:$src2), + IsIU : (ins Src2VT:$src2), + IsSWMMAC : (ins)); + dag ClampPat = !if(IsIU, (ins i1:$clamp), (ins)); + dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2), + !eq(IndexType, 8) : (ins (i32 (SWMMACIndex8 i32:$src2, i32:$index_key_8bit))), + !eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit)))); + dag IndexOutPat = !cond(!eq(IndexType, 0) : (ins i32:$src2), + !eq(IndexType, 8) : (ins i32:$src2, i32:$index_key_8bit), + !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit)); + dag Src2InlineInPat = (ins (Src2VT (WMMAVISrc Src2VT:$src2))); + dag Src2InlineOutPat = !con(!if(IsIU, (ins), (ins (i32 8))), (ins Src2VT:$src2)); + + + dag WmmaInPat = !con(Src0InPat, 
Src1InPat, Src2InPatWmma, ClampPat); + dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, ClampPat); + + dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, ClampPat); + dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, ClampPat); + + // wmma pattern where src2 is inline imm uses _threeaddr pseudo, + // can't use _twoaddr since it would violate src2 tied to vdst constraint. + dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, ClampPat); + dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, ClampPat); +} + +multiclass WMMAInstGFX12 { + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let Constraints = "@earlyclobber $vdst,$vdst = $src2", isConvertibleToThreeAddress = 1 in + def _twoaddr : VOP3P_Pseudo{ + let PseudoInstr = Instr#PseudoInstrSuffix; + } + + let Constraints = "@earlyclobber $vdst", SchedRW = [Write32Bit, Write32Bit] in + def _threeaddr : VOP3P_Pseudo{ + let PseudoInstr = Instr#PseudoInstrSuffix; + } + + } + def : WMMAOpcodeMapping(NAME # _twoaddr), + !cast(NAME # _threeaddr)>; +} + +multiclass SWMMACInstGFX12 { + def _twoaddr : VOP3P_Pseudo{ + let Mnemonic = Instr; + let PseudoInstr = Instr#PseudoInstrSuffix; + let mayRaiseFPException = 0; + let ReadsModeReg = 0; + let AsmMatchConverter = "cvtSWMMAC"; + + let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef"; + } +} + +// First argument in Profile is types for matrices D, A, B and C (D = A * B + C) +// as used by llvm ir, types are vectors(with matrix elements) +// wave32: +// For 16x16 matrices, lanes 0 to 31 will have 8 matrix elts, +// for 16 x 32 16 elts and for 16 x 64 lanes have 32 elts. 
+// wave64: +// lanes will have half the size of elements in lanes compared to wave32 with +// exception of 16x16_iu4: lanes0-31 will have 8xi4, remaining lanes are ignored + +// general idea on element distribution differences: +// wave32: lane n has 8 matrix elements +// wave64: lane n has first 4, lane n+32 has other 4 elements + +// index size, for each 2 elements in lane you need 4bits in index + +// Non-standard types (iu8, iu4, fp8, bf8) will be packed in vectors of i32s. +// Original type for them is in comment on the right and refers to A and B. + +def F32_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v8f16, v8f32], 0, 0, 0, 0>; +def F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v8i16, v8f32], 0, 0, 0, 0>; +def F16_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v8f16, v8f16], 0, 0, 0, 0>; +def BF16_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v8i16, v8i16], 0, 0, 0, 0>; +def I32_IU8_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 8xi8 +def I32_IU4X16_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, i32, i32, v8i32], 0, 0, 1, 0>; // 8xi4 +def F32_FP8BF8_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v2i32, v8f32], 0, 0, 0, 1>; // 8xf8 +def I32_IU4X32_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 16xi4 + +def F32_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v4f16, v4f32], 0, 0, 0, 0>; +def F32_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v4i16, v4f32], 0, 0, 0, 0>; +def F16_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v4f16, v4f16], 0, 0, 0, 0>; +def BF16_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v4i16, v4i16], 0, 0, 0, 0>; +def I32_IU8_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 4xi8 +def I32_IU4X16_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4 * +def F32_FP8BF8_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, i32, i32, v4f32], 0, 0, 0, 1>; // 4xf8 +def I32_IU4X32_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4 + 
+def F32_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v16f16, v8f32], 1, 16, 0, 0>; +def F32_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v16i16, v8f32], 1, 16, 0, 0>; +def F16_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v16f16, v8f16], 1, 16, 0, 0>; +def BF16_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v16i16, v8i16], 1, 16, 0, 0>; +def I32_IU8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 16, 1, 0>; // 8xi8, 16xi8 +def I32_IU4X32_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, i32, v2i32, v8i32], 1, 16, 1, 0>; // 8xi4, 16xi4 +def I32_IU4X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 0, 1, 0>; // 16xi4, 32xi4 ** +def F32_FP8BF8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v4i32, v8f32], 1, 16, 0, 1>; // 8xf8, 16xf8 + +def F32_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v8f16, v4f32], 1, 8, 0, 0>; +def F32_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v8i16, v4f32], 1, 8, 0, 0>; +def F16_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v8f16, v4f16], 1, 8, 0, 0>; +def BF16_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v8i16, v4i16], 1, 8, 0, 0>; +def I32_IU8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 8, 1, 0>; // 4xi8, 8xi8 +def I32_IU4X32_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 1, 16, 1, 0>; // 8xi4, 8xi4 *** +def I32_IU4X64_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 16, 1, 0>; // 8xi4, 16xi4 +def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1, 8, 0, 1>; // 4xf8, 8xf8 + +// * IU4X16_WMMA_w64 lanes 0-31 will have 8xi4, remaining lanes are ignored +// ** IU4X64_SWMMAC_w32 index is i32, index_key is not used +// *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored +// for matrix A, index is i16; Matrix B uses all lanes + +let WaveSizePredicate = isWave32 in { +defm V_WMMA_F32_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_BF16_w32 
: WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x16_f16", F16_F16_WMMA_w32, "_w32">; +defm V_WMMA_BF16_16X16X16_BF16_w32 : WMMAInstGFX12<"v_wmma_bf16_16x16x16_bf16", BF16_BF16_WMMA_w32, "_w32">; +defm V_WMMA_I32_16X16X16_IU8_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu8", I32_IU8_WMMA_w32, "_w32">; +defm V_WMMA_I32_16X16X16_IU4_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu4", I32_IU4X16_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_fp8", F32_FP8BF8_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_bf8", F32_FP8BF8_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_fp8", F32_FP8BF8_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_bf8", F32_FP8BF8_WMMA_w32, "_w32">; +defm V_WMMA_I32_16X16X32_IU4_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x32_iu4", I32_IU4X32_WMMA_w32, "_w32">; + +defm V_SWMMAC_F32_16X16X32_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_f16", F32_F16_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_BF16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf16", F32_BF16_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F16_16X16X32_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x32_f16", F16_F16_SWMMAC_w32, "_w32">; +defm V_SWMMAC_BF16_16X16X32_BF16_w32 : SWMMACInstGFX12<"v_swmmac_bf16_16x16x32_bf16", BF16_BF16_SWMMAC_w32, "_w32">; +defm V_SWMMAC_I32_16X16X32_IU8_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu8", I32_IU8_SWMMAC_w32, "_w32">; +defm V_SWMMAC_I32_16X16X32_IU4_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu4", I32_IU4X32_SWMMAC_w32, "_w32">; +defm V_SWMMAC_I32_16X16X64_IU4_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x64_iu4", I32_IU4X64_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_fp8", F32_FP8BF8_SWMMAC_w32, "_w32">; +defm 
V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_bf8", F32_FP8BF8_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_fp8", F32_FP8BF8_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_bf8", F32_FP8BF8_SWMMAC_w32, "_w32">; +} + +let WaveSizePredicate = isWave64 in { +defm V_WMMA_F32_16X16X16_F16_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_BF16_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w64, "_w64">; +defm V_WMMA_F16_16X16X16_F16_w64 : WMMAInstGFX12<"v_wmma_f16_16x16x16_f16", F16_F16_WMMA_w64, "_w64">; +defm V_WMMA_BF16_16X16X16_BF16_w64 : WMMAInstGFX12<"v_wmma_bf16_16x16x16_bf16", BF16_BF16_WMMA_w64, "_w64">; +defm V_WMMA_I32_16X16X16_IU8_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu8", I32_IU8_WMMA_w64, "_w64">; +defm V_WMMA_I32_16X16X16_IU4_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu4", I32_IU4X16_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_fp8", F32_FP8BF8_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_bf8", F32_FP8BF8_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_fp8", F32_FP8BF8_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_bf8", F32_FP8BF8_WMMA_w64, "_w64">; +defm V_WMMA_I32_16X16X32_IU4_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x32_iu4", I32_IU4X32_WMMA_w64, "_w64">; + +defm V_SWMMAC_F32_16X16X32_F16_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_f16", F32_F16_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F32_16X16X32_BF16_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf16", F32_BF16_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F16_16X16X32_F16_w64 : SWMMACInstGFX12<"v_swmmac_f16_16x16x32_f16", F16_F16_SWMMAC_w64, "_w64">; +defm V_SWMMAC_BF16_16X16X32_BF16_w64 : 
SWMMACInstGFX12<"v_swmmac_bf16_16x16x32_bf16", BF16_BF16_SWMMAC_w64, "_w64">; +defm V_SWMMAC_I32_16X16X32_IU8_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu8", I32_IU8_SWMMAC_w64, "_w64">; +defm V_SWMMAC_I32_16X16X32_IU4_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu4", I32_IU4X32_SWMMAC_w64, "_w64">; +defm V_SWMMAC_I32_16X16X64_IU4_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x64_iu4", I32_IU4X64_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_fp8", F32_FP8BF8_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_bf8", F32_FP8BF8_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_fp8", F32_FP8BF8_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_bf8", F32_FP8BF8_SWMMAC_w64, "_w64">; +} + +// IsGFX11OpselIntrinsic: f16_f16 and bf16_bf16 Intrinsics have imm operand that +// controls opsel. Used by gfx11, removed in gfx12 (operand must be 0). 
+multiclass WMMAPat { + def : GCNPat <(P.DstVT !setdagop(!con(P.WmmaInPat, !if(IsGFX11OpselIntrinsic, (ins 0), (ins))), node)), + (P.DstVT !setdagop(P.WmmaOutPat, !cast(Inst#"_twoaddr")))>; + let AddedComplexity = 4 in + def : GCNPat <(P.DstVT !setdagop(!con(P.WmmaInlineInPat, !if(IsGFX11OpselIntrinsic, (ins 0), (ins))), node)), + (P.DstVT !setdagop(P.WmmaInlineOutPat, !cast(Inst#"_threeaddr")))>; +} + +class SWMMACPat : + GCNPat <(P.DstVT !setdagop(P.SwmmacInPat, node)), + (P.DstVT !setdagop(P.SwmmacOutPat, Inst))>; + +class SWMMACPat_w64 : + GCNPat <(P.DstVT !setdagop(P.SwmmacInPat, node)), + (P.DstVT !setdagop(P.SwmmacOutPat, Inst))>{ + let WaveSizePredicate = isWave64; + } + +let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12Plus in { + defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w32", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w32", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w32", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w32,1>; + defm : WMMAPat<"V_WMMA_BF16_16X16X16_BF16_w32", int_amdgcn_wmma_bf16_16x16x16_bf16, BF16_BF16_WMMA_w32,1>; + defm : WMMAPat<"V_WMMA_I32_16X16X16_IU8_w32", int_amdgcn_wmma_i32_16x16x16_iu8, I32_IU8_WMMA_w32>; + defm : WMMAPat<"V_WMMA_I32_16X16X16_IU4_w32", int_amdgcn_wmma_i32_16x16x16_iu4, I32_IU4X16_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_FP8_w32", int_amdgcn_wmma_f32_16x16x16_fp8_fp8, F32_FP8BF8_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x16_fp8_bf8, F32_FP8BF8_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x16_bf8_fp8, F32_FP8BF8_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x16_bf8_bf8, F32_FP8BF8_WMMA_w32>; + defm : WMMAPat<"V_WMMA_I32_16X16X32_IU4_w32", int_amdgcn_wmma_i32_16x16x32_iu4, I32_IU4X32_WMMA_w32>; + + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def 
: SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : GCNPat <(I32_IU4X64_SWMMAC_w32.DstVT !setdagop(I32_IU4X64_SWMMAC_w32.SwmmacInPat, int_amdgcn_swmmac_i32_16x16x64_iu4)), + (I32_IU4X64_SWMMAC_w32.DstVT !setdagop(I32_IU4X64_SWMMAC_w32.SwmmacOutPat, V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr))>; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; +} + +let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in { + defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w64", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w64", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w64", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w64,1>; + defm : WMMAPat<"V_WMMA_BF16_16X16X16_BF16_w64", int_amdgcn_wmma_bf16_16x16x16_bf16, BF16_BF16_WMMA_w64,1>; + defm : WMMAPat<"V_WMMA_I32_16X16X16_IU8_w64", int_amdgcn_wmma_i32_16x16x16_iu8, I32_IU8_WMMA_w64>; + defm : WMMAPat<"V_WMMA_I32_16X16X16_IU4_w64", int_amdgcn_wmma_i32_16x16x16_iu4, I32_IU4X16_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_FP8_w64", int_amdgcn_wmma_f32_16x16x16_fp8_fp8, F32_FP8BF8_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_BF8_w64", int_amdgcn_wmma_f32_16x16x16_fp8_bf8, F32_FP8BF8_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_FP8_w64", int_amdgcn_wmma_f32_16x16x16_bf8_fp8, F32_FP8BF8_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_BF8_w64", int_amdgcn_wmma_f32_16x16x16_bf8_bf8, F32_FP8BF8_WMMA_w64>; + defm : WMMAPat<"V_WMMA_I32_16X16X32_IU4_w64", int_amdgcn_wmma_i32_16x16x32_iu4, I32_IU4X32_WMMA_w64>; + + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; +} + + //===----------------------------------------------------------------------===// // Begin Real Encodings 
//===----------------------------------------------------------------------===// @@ -1005,6 +1400,99 @@ multiclass VOP3P_Real_Base op, string backing_ps_name = NAME VOP3Pe_gfx11_gfx12(backing_ps_name).Pfl>; } +class VOP3PeWmma op, VOPProfile P, VOP3PWMMA_Profile WMMAP> + : VOP3Pe_gfx11_gfx12{ + // opsel + let Inst{11} = !cond(!eq(WMMAP.IndexType, 0) : 0, + !eq(WMMAP.IndexType, 8) : index_key_8bit{0}, + !eq(WMMAP.IndexType, 16) : index_key_16bit{0}); + let Inst{12} = !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0); + let Inst{13} = 0; + // opsel_hi + let Inst{59} = 1; + let Inst{60} = 1; + let Inst{14} = 1; + // neg_lo + let Inst{61} = !if(WMMAP.NegLo01, src0_modifiers{0}, 0); + let Inst{62} = !if(WMMAP.NegLo01, src1_modifiers{0}, 0); + let Inst{63} = !if(WMMAP.NegLo2, src2_modifiers{0}, 0); + // neg_hi + let Inst{8} = !if(WMMAP.NegHi01, src0_modifiers{1}, 0); + let Inst{9} = !if(WMMAP.NegHi01, src1_modifiers{1}, 0); + let Inst{10} = !if(WMMAP.NegHi2, src2_modifiers{1}, 0); + // clamp + let Inst{15} = !if(WMMAP.IsIU, clamp{0}, 0); +} + +multiclass VOP3P_WMMA_Real_Base op, VOP3PWMMA_Profile WMMAP, + string backing_ps_name = NAME, + string asmName = !cast(NAME).Mnemonic> { + def Gen.Suffix : + VOP3P_Real_Gen(backing_ps_name), Gen, asmName>, + VOP3PeWmma(backing_ps_name).Pfl, WMMAP>; +} + +multiclass VOP3P_Real_WMMA_gfx12 op, VOP3PWMMA_Profile WMMAP> { + let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in { + defm _twoaddr : VOP3P_WMMA_Real_Base ; + } +} + +multiclass VOP3P_Real_WMMA_gfx12w64 op, VOP3PWMMA_Profile WMMAP> { + let WaveSizePredicate = isWave64, DecoderNamespace = "WMMAGFX12" in { + defm _twoaddr : VOP3P_WMMA_Real_Base ; + } +} + +defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>; +defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>; +defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>; +defm V_WMMA_BF16_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 
<0x043, BF16_BF16_WMMA_w32>; +defm V_WMMA_I32_16X16X16_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x044, I32_IU8_WMMA_w32>; +defm V_WMMA_I32_16X16X16_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x045, I32_IU4X16_WMMA_w32>; +defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x046, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x047, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x048, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x049, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x04a, I32_IU4X32_WMMA_w32>; + +defm V_WMMA_F32_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x040, F32_F16_WMMA_w64>; +defm V_WMMA_F32_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x041, F32_BF16_WMMA_w64>; +defm V_WMMA_F16_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x042, F16_F16_WMMA_w64>; +defm V_WMMA_BF16_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x043, BF16_BF16_WMMA_w64>; +defm V_WMMA_I32_16X16X16_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x044, I32_IU8_WMMA_w64>; +defm V_WMMA_I32_16X16X16_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x045, I32_IU4X16_WMMA_w64>; +defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x046, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x047, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x048, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x049, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x04a, I32_IU4X32_WMMA_w64>; + + +defm V_SWMMAC_F32_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x050, F32_F16_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x051, F32_BF16_SWMMAC_w32>; +defm V_SWMMAC_F16_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x052, F16_F16_SWMMAC_w32>; +defm V_SWMMAC_BF16_16X16X32_BF16_w32 : 
VOP3P_Real_WMMA_gfx12 <0x053, BF16_BF16_SWMMAC_w32>; +defm V_SWMMAC_I32_16X16X32_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x054, I32_IU8_SWMMAC_w32>; +defm V_SWMMAC_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x055, I32_IU4X32_SWMMAC_w32>; +defm V_SWMMAC_I32_16X16X64_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x056, I32_IU4X64_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x057, F32_FP8BF8_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x058, F32_FP8BF8_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x059, F32_FP8BF8_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x05a, F32_FP8BF8_SWMMAC_w32>; + +defm V_SWMMAC_F32_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x050, F32_F16_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x051, F32_BF16_SWMMAC_w64>; +defm V_SWMMAC_F16_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x052, F16_F16_SWMMAC_w64>; +defm V_SWMMAC_BF16_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x053, BF16_BF16_SWMMAC_w64>; +defm V_SWMMAC_I32_16X16X32_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x054, I32_IU8_SWMMAC_w64>; +defm V_SWMMAC_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x055, I32_IU4X32_SWMMAC_w64>; +defm V_SWMMAC_I32_16X16X64_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x056, I32_IU4X64_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x057, F32_FP8BF8_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x058, F32_FP8BF8_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>; + multiclass VOP3P_Real_with_name op, string backing_ps_name = NAME, string asmName = !cast(NAME).Mnemonic> { diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 
df505c3365cbde..20d7c88fb7e59f 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -124,6 +124,7 @@ class VOP3_Pseudo pattern = [], let IsPacked = P.IsPacked; let IsMAI = P.IsMAI; let IsWMMA = P.IsWMMA; + let IsSWMMAC = P.IsSWMMAC; let AsmOperands = !if(isVop3OpSel, P.AsmVOP3OpSel, @@ -305,6 +306,11 @@ class VOP3OpSel_gfx10 op, VOPProfile p> : VOP3e_gfx10 { class VOP3OpSel_gfx11_gfx12 op, VOPProfile p> : VOP3OpSel_gfx10; +class VOP3FP8OpSel_gfx11_gfx12 op, VOPProfile p> : VOP3e_gfx10 { + let Inst{11} = !if(p.HasSrc0, src0_modifiers{2}, 0); + let Inst{12} = !if(p.HasSrc0, src0_modifiers{3}, 0); +} + class VOP3DotOpSel_gfx11_gfx12 op, VOPProfile p> : VOP3OpSel_gfx11_gfx12{ let Inst{11} = ?; let Inst{12} = ?; @@ -378,6 +384,8 @@ class VOP3Pe op, VOPProfile P> : Enc64 { bits<4> src2_modifiers; bits<9> src2; bits<1> clamp; + bits<2> index_key_8bit; + bits<1> index_key_16bit; let Inst{7-0} = vdst; let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0 @@ -738,7 +746,7 @@ class VOP3_DPPe_Common_Base op, VOPProfile P> : Enc96 { let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs. 
let Inst{11} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{2}, 0),?); - let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, 0),?); + let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, !if((P.IsFP8), src0_modifiers{3}, 0)), ?); let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),?); let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),?); let Inst{15} = !if(P.HasClamp, clamp, 0); @@ -1406,14 +1414,20 @@ multiclass VOP3_Real_with_name op, string opName, defvar ps = !cast(opName#"_e64"); let AsmString = asmName # ps.AsmOperands, IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { - if ps.Pfl.HasOpSel then - def _e64#Gen.Suffix : - VOP3_Real_Gen, - VOP3OpSel_gfx11_gfx12; - if !not(ps.Pfl.HasOpSel) then - def _e64#Gen.Suffix : - VOP3_Real_Gen, - VOP3e_gfx11_gfx12; + if ps.Pfl.IsFP8 then { + def _e64#Gen.Suffix : + VOP3_Real_Gen, + VOP3FP8OpSel_gfx11_gfx12; + } else { + if ps.Pfl.HasOpSel then + def _e64#Gen.Suffix : + VOP3_Real_Gen, + VOP3OpSel_gfx11_gfx12; + if !not(ps.Pfl.HasOpSel) then + def _e64#Gen.Suffix : + VOP3_Real_Gen, + VOP3e_gfx11_gfx12; + } } def Gen.Suffix#"_VOP3_alias" : MnemonicAlias, Requires<[Gen.AssemblerPredicate]>, LetDummies; } diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index eeb7f64aa5810e..9b54dd4e4e618d 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -2781,10 +2781,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, AFI->setLRIsSpilled(SavedRegs.test(ARM::LR)); } -void ARMFrameLowering::processFunctionBeforeFrameFinalized( - MachineFunction &MF, RegScavenger *RS) const { - TargetFrameLowering::processFunctionBeforeFrameFinalized(MF, RS); - +void ARMFrameLowering::updateLRRestored(MachineFunction &MF) { MachineFrameInfo &MFI = MF.getFrameInfo(); if (!MFI.isCalleeSavedInfoValid()) return; @@ -2808,6 +2805,12 @@ void 
ARMFrameLowering::processFunctionBeforeFrameFinalized( } } +void ARMFrameLowering::processFunctionBeforeFrameFinalized( + MachineFunction &MF, RegScavenger *RS) const { + TargetFrameLowering::processFunctionBeforeFrameFinalized(MF, RS); + updateLRRestored(MF); +} + void ARMFrameLowering::getCalleeSaves(const MachineFunction &MF, BitVector &SavedRegs) const { TargetFrameLowering::getCalleeSaves(MF, SavedRegs); diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.h b/llvm/lib/Target/ARM/ARMFrameLowering.h index 8d2b8beb9a58fb..3c7358d8cd53e2 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.h +++ b/llvm/lib/Target/ARM/ARMFrameLowering.h @@ -59,6 +59,10 @@ class ARMFrameLowering : public TargetFrameLowering { void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const override; + /// Update the IsRestored flag on LR if it is spilled, based on the return + /// instructions. + static void updateLRRestored(MachineFunction &MF); + void processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS = nullptr) const override; diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index ed9d30c3c3ab90..6121055eb02176 100644 --- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -2062,17 +2062,6 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) { MO.setReg(ARM::PC); PrevMI.copyImplicitOps(*MBB.getParent(), *MBBI); MBB.erase(MBBI); - // We now restore LR into PC so it is not live-out of the return block - // anymore: Clear the CSI Restored bit. 
- MachineFrameInfo &MFI = MBB.getParent()->getFrameInfo(); - // CSI should be fixed after PrologEpilog Insertion - assert(MFI.isCalleeSavedInfoValid() && "CSI should be valid"); - for (CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) { - if (Info.getReg() == ARM::LR) { - Info.setRestored(false); - break; - } - } return true; } } @@ -2120,14 +2109,22 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { isThumb2 = AFI->isThumb2Function(); isThumb1 = AFI->isThumbFunction() && !isThumb2; - bool Modified = false; + bool Modified = false, ModifiedLDMReturn = false; for (MachineBasicBlock &MBB : Fn) { Modified |= LoadStoreMultipleOpti(MBB); if (STI->hasV5TOps() && !AFI->shouldSignReturnAddress()) - Modified |= MergeReturnIntoLDM(MBB); + ModifiedLDMReturn |= MergeReturnIntoLDM(MBB); if (isThumb1) Modified |= CombineMovBx(MBB); } + Modified |= ModifiedLDMReturn; + + // If we merged a BX instruction into an LDM, we need to re-calculate whether + // LR is restored. This check needs to consider the whole function, not just + // the instruction(s) we changed, because there may be other BX returns which + // still need LR to be restored. + if (ModifiedLDMReturn) + ARMFrameLowering::updateLRRestored(Fn); Allocator.DestroyAll(); return Modified; diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 5c1c7046fdbff0..8629551152cb64 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -1806,12 +1806,13 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { PostOrderLoopTraversal DFS(LoLoop.ML, *MLI); DFS.ProcessLoop(); const SmallVectorImpl &PostOrder = DFS.getOrder(); - for (auto *MBB : PostOrder) { - recomputeLiveIns(*MBB); - // FIXME: For some reason, the live-in print order is non-deterministic for - // our tests and I can't out why... So just sort them. 
- MBB->sortUniqueLiveIns(); - } + bool anyChange = false; + do { + anyChange = false; + for (auto *MBB : PostOrder) { + anyChange = recomputeLiveIns(*MBB) || anyChange; + } + } while (anyChange); for (auto *MBB : reverse(PostOrder)) recomputeLivenessFlags(*MBB); diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td index efaaec32ee6bb1..0a77c7c1d418a1 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.td +++ b/llvm/lib/Target/AVR/AVRInstrInfo.td @@ -1398,7 +1398,7 @@ let mayLoad = 1, hasSideEffects = 0, // Load indirect with displacement operations. let canFoldAsLoad = 1, isReMaterializable = 1 in { - let Constraints = "@earlyclobber $reg" in def LDDRdPtrQ + def LDDRdPtrQ : FSTDLDD<0, (outs GPR8 : $reg), diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 76c1a14fe0156c..907aae13d6de0c 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -2343,7 +2343,9 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode( LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0), DAG.getConstant(CN1->getSExtValue() >> MaskIdx0, DL, ValTy), - DAG.getConstant((MaskIdx0 + MaskLen0 - 1), DL, GRLenVT), + DAG.getConstant(ValBits == 32 ? (MaskIdx0 + (MaskLen0 & 31) - 1) + : (MaskIdx0 + MaskLen0 - 1), + DL, GRLenVT), DAG.getConstant(MaskIdx0, DL, GRLenVT)); } @@ -4940,3 +4942,8 @@ bool LoongArchTargetLowering::hasAndNotCompare(SDValue Y) const { return !isa(Y); } + +ISD::NodeType LoongArchTargetLowering::getExtendForAtomicCmpSwapArg() const { + // TODO: LAMCAS will use amcas{_DB,}.[bhwd] which does not require extension. 
+ return ISD::SIGN_EXTEND; +} diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 72182623b2c3dd..9e9ac0b8269291 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -206,6 +206,8 @@ class LoongArchTargetLowering : public TargetLowering { return ISD::SIGN_EXTEND; } + ISD::NodeType getExtendForAtomicCmpSwapArg() const override; + Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override; bool mayBeEmittedAsTailCall(const CallInst *CI) const override; diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.h b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.h index 7d39d47e86b363..fa9bc7608e7d2c 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.h +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.h @@ -45,6 +45,11 @@ class LoongArchTargetMachine : public LLVMTargetMachine { MachineFunctionInfo * createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const override; + + // Addrspacecasts are always noops. + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { + return true; + } }; } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp index 04349aa52b5408..d47dded9ea6ecf 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp @@ -21,17 +21,20 @@ using namespace llvm; TypeSize LoongArchTTIImpl::getRegisterBitWidth( TargetTransformInfo::RegisterKind K) const { + TypeSize DefSize = TargetTransformInfoImplBase::getRegisterBitWidth(K); switch (K) { case TargetTransformInfo::RGK_Scalar: return TypeSize::getFixed(ST->is64Bit() ? 
64 : 32); case TargetTransformInfo::RGK_FixedWidthVector: - if (ST->hasExtLASX() && ST->hasExpAutoVec()) + if (!ST->hasExpAutoVec()) + return DefSize; + if (ST->hasExtLASX()) return TypeSize::getFixed(256); - if (ST->hasExtLSX() && ST->hasExpAutoVec()) + if (ST->hasExtLSX()) return TypeSize::getFixed(128); - return TypeSize::getFixed(0); + [[fallthrough]]; case TargetTransformInfo::RGK_ScalableVector: - return TypeSize::getScalable(0); + return DefSize; } llvm_unreachable("Unsupported register kind"); diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 3c673ae938fdec..9d6e8dc573a8d1 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -150,6 +150,7 @@ class MipsAsmParser : public MCTargetAsmParser { bool IsLittleEndian; bool IsPicEnabled; bool IsCpRestoreSet; + bool CurForbiddenSlotAttr; int CpRestoreOffset; unsigned GPReg; unsigned CpSaveLocation; @@ -552,6 +553,7 @@ class MipsAsmParser : public MCTargetAsmParser { CurrentFn = nullptr; + CurForbiddenSlotAttr = false; IsPicEnabled = getContext().getObjectFileInfo()->isPositionIndependent(); IsCpRestoreSet = false; @@ -723,6 +725,16 @@ class MipsAsmParser : public MCTargetAsmParser { return getSTI().hasFeature(Mips::FeatureGINV); } + bool hasForbiddenSlot(const MCInstrDesc &MCID) const { + return !inMicroMipsMode() && (MCID.TSFlags & MipsII::HasForbiddenSlot); + } + + bool SafeInForbiddenSlot(const MCInstrDesc &MCID) const { + return !(MCID.TSFlags & MipsII::IsCTI); + } + + void onEndOfFile() override; + /// Warn if RegIndex is the same as the current AT. void warnIfRegIndexIsAT(unsigned RegIndex, SMLoc Loc); @@ -2307,7 +2319,41 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, bool FillDelaySlot = MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder(); - if (FillDelaySlot) + + // Get previous instruction`s forbidden slot attribute and + // whether set reorder. 
+ bool PrevForbiddenSlotAttr = CurForbiddenSlotAttr; + + // Flag represents we set reorder after nop. + bool SetReorderAfterNop = false; + + // If previous instruction has forbidden slot and .set reorder + // is active and current instruction is CTI. + // Then emit a NOP after it. + if (PrevForbiddenSlotAttr && !SafeInForbiddenSlot(MCID)) { + TOut.emitEmptyDelaySlot(false, IDLoc, STI); + // When 'FillDelaySlot' is true, the existing logic will add + // noreorder before instruction and reorder after it. So there + // need exclude this case avoiding two '.set reorder'. + // The format of the first case is: + // .set noreorder + // bnezc + // nop + // .set reorder + if (AssemblerOptions.back()->isReorder() && !FillDelaySlot) { + SetReorderAfterNop = true; + TOut.emitDirectiveSetReorder(); + } + } + + // Save current instruction`s forbidden slot and whether set reorder. + // This is the judgment condition for whether to add nop. + // We would add a couple of '.set noreorder' and '.set reorder' to + // wrap the current instruction and the next instruction. + CurForbiddenSlotAttr = + hasForbiddenSlot(MCID) && AssemblerOptions.back()->isReorder(); + + if (FillDelaySlot || CurForbiddenSlotAttr) TOut.emitDirectiveSetNoReorder(); MacroExpanderResultTy ExpandResult = @@ -2322,6 +2368,17 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, return true; } + // When current instruction was not CTI, recover reorder state. + // The format of the second case is: + // .set noreoder + // bnezc + // add + // .set reorder + if (PrevForbiddenSlotAttr && !SetReorderAfterNop && !FillDelaySlot && + AssemblerOptions.back()->isReorder()) { + TOut.emitDirectiveSetReorder(); + } + // We know we emitted an instruction on the MER_NotAMacro or MER_Success path. // If we're in microMIPS mode then we must also set EF_MIPS_MICROMIPS. 
if (inMicroMipsMode()) { @@ -2331,6 +2388,14 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, // If this instruction has a delay slot and .set reorder is active, // emit a NOP after it. + // The format of the third case is: + // .set noreorder + // bnezc + // nop + // .set noreorder + // j + // nop + // .set reorder if (FillDelaySlot) { TOut.emitEmptyDelaySlot(hasShortDelaySlot(Inst), IDLoc, STI); TOut.emitDirectiveSetReorder(); @@ -2356,6 +2421,17 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, return false; } +void MipsAsmParser::onEndOfFile() { + MipsTargetStreamer &TOut = getTargetStreamer(); + SMLoc IDLoc = SMLoc(); + // If has pending forbidden slot, fill nop and recover reorder. + if (CurForbiddenSlotAttr) { + TOut.emitEmptyDelaySlot(false, IDLoc, STI); + if (AssemblerOptions.back()->isReorder()) + TOut.emitDirectiveSetReorder(); + } +} + MipsAsmParser::MacroExpanderResultTy MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { @@ -2920,6 +2996,11 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, (Res.getSymA()->getSymbol().isELF() && cast(Res.getSymA()->getSymbol()).getBinding() == ELF::STB_LOCAL); + // For O32, "$"-prefixed symbols are recognized as temporary while + // .L-prefixed symbols are not (PrivateGlobalPrefix is "$"). Recognize ".L" + // manually. + if (ABI.IsO32() && Res.getSymA()->getSymbol().getName().starts_with(".L")) + IsLocalSym = true; bool UseXGOT = STI->hasFeature(Mips::FeatureXGOT) && !IsLocalSym; // The case where the result register is $25 is somewhat special. 
If the @@ -6359,7 +6440,7 @@ bool MipsAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { return true; SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); - MCSymbol *Sym = getContext().getOrCreateSymbol("$" + Identifier); + MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier); // Otherwise create a symbol reference. const MCExpr *SymRef = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index 27d7f0f261d100..adfcea73615831 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -1255,7 +1255,9 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, emitRRI(Mips::SD, GPReg, Mips::SP, RegOrOffset, SMLoc(), &STI); } - if (getABI().IsN32()) { +#if 0 + // We haven't support -mabicalls -mno-shared yet. + if (-mno-shared) { MCSymbol *GPSym = MCA.getContext().getOrCreateSymbol("__gnu_local_gp"); const MipsMCExpr *HiExpr = MipsMCExpr::create( MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(GPSym, MCA.getContext()), @@ -1273,6 +1275,7 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, return; } +#endif const MipsMCExpr *HiExpr = MipsMCExpr::createGpOff( MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(&Sym, MCA.getContext()), @@ -1288,8 +1291,11 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, emitRRX(Mips::ADDiu, GPReg, GPReg, MCOperand::createExpr(LoExpr), SMLoc(), &STI); - // daddu $gp, $gp, $funcreg - emitRRR(Mips::DADDu, GPReg, GPReg, RegNo, SMLoc(), &STI); + // (d)addu $gp, $gp, $funcreg + if (getABI().IsN32()) + emitRRR(Mips::ADDu, GPReg, GPReg, RegNo, SMLoc(), &STI); + else + emitRRR(Mips::DADDu, GPReg, GPReg, RegNo, SMLoc(), &STI); } void MipsTargetELFStreamer::emitDirectiveCpreturn(unsigned SaveLocation, diff --git 
a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td index 854563ab32bd8e..3ef04e488f016f 100644 --- a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td +++ b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td @@ -152,15 +152,15 @@ class SELNEZ_ENC : SPECIAL_3R_FM<0b00000, 0b110111>; class LWPC_ENC : PCREL19_FM; -class MAX_S_ENC : COP1_3R_FM<0b011101, FIELD_FMT_S>; -class MAX_D_ENC : COP1_3R_FM<0b011101, FIELD_FMT_D>; +class MAX_S_ENC : COP1_3R_FM<0b011110, FIELD_FMT_S>; +class MAX_D_ENC : COP1_3R_FM<0b011110, FIELD_FMT_D>; class MIN_S_ENC : COP1_3R_FM<0b011100, FIELD_FMT_S>; class MIN_D_ENC : COP1_3R_FM<0b011100, FIELD_FMT_D>; class MAXA_S_ENC : COP1_3R_FM<0b011111, FIELD_FMT_S>; class MAXA_D_ENC : COP1_3R_FM<0b011111, FIELD_FMT_D>; -class MINA_S_ENC : COP1_3R_FM<0b011110, FIELD_FMT_S>; -class MINA_D_ENC : COP1_3R_FM<0b011110, FIELD_FMT_D>; +class MINA_S_ENC : COP1_3R_FM<0b011101, FIELD_FMT_S>; +class MINA_D_ENC : COP1_3R_FM<0b011101, FIELD_FMT_D>; class SELEQZ_S_ENC : COP1_3R_FM<0b010100, FIELD_FMT_S>; class SELEQZ_D_ENC : COP1_3R_FM<0b010100, FIELD_FMT_D>; diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index 718844bc36ff93..66b2b0de8d52a3 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -471,45 +471,6 @@ void MipsAsmPrinter::emitBasicBlockEnd(const MachineBasicBlock &MBB) { TS.emitDirectiveInsn(); } -/// isBlockOnlyReachableByFallthough - Return true if the basic block has -/// exactly one predecessor and the control transfer mechanism between -/// the predecessor and this block is a fall-through. -bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock* - MBB) const { - // The predecessor has to be immediately before this block. - const MachineBasicBlock *Pred = *MBB->pred_begin(); - - // If the predecessor is a switch statement, assume a jump table - // implementation, so it is not a fall through. 
- if (const BasicBlock *bb = Pred->getBasicBlock()) - if (isa(bb->getTerminator())) - return false; - - // If this is a landing pad, it isn't a fall through. If it has no preds, - // then nothing falls through to it. - if (MBB->isEHPad() || MBB->pred_empty()) - return false; - - // If there isn't exactly one predecessor, it can't be a fall through. - if (MBB->pred_size() != 1) - return false; - - // The predecessor has to be immediately before this block. - if (!Pred->isLayoutSuccessor(MBB)) - return false; - - // If the block is completely empty, then it definitely does fall through. - if (Pred->empty()) - return true; - - // Otherwise, check the last instruction. - // Check if the last terminator is an unconditional branch. - MachineBasicBlock::const_iterator I = Pred->end(); - while (I != Pred->begin() && !(--I)->isTerminator()) ; - - return !I->isBarrier(); -} - // Print out an operand for an inline asm expression. bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, const char *ExtraCode, raw_ostream &O) { diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.h b/llvm/lib/Target/Mips/MipsAsmPrinter.h index 64424b181504a7..0b55089385d79d 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.h +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.h @@ -142,8 +142,6 @@ class LLVM_LIBRARY_VISIBILITY MipsAsmPrinter : public AsmPrinter { void emitFunctionBodyStart() override; void emitFunctionBodyEnd() override; void emitBasicBlockEnd(const MachineBasicBlock &MBB) override; - bool isBlockOnlyReachableByFallthrough( - const MachineBasicBlock* MBB) const override; bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index d431d3d91494f6..88b226eaaccfab 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ 
b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -4128,14 +4128,18 @@ MipsTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case 'd': // Address register. Same as 'r' unless generating MIPS16 code. case 'y': // Same as 'r'. Exists for compatibility. case 'r': - if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8 || VT == MVT::i1) { + if ((VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8 || + VT == MVT::i1) || + (VT == MVT::f32 && Subtarget.useSoftFloat())) { if (Subtarget.inMips16Mode()) return std::make_pair(0U, &Mips::CPU16RegsRegClass); return std::make_pair(0U, &Mips::GPR32RegClass); } - if (VT == MVT::i64 && !Subtarget.isGP64bit()) + if ((VT == MVT::i64 || (VT == MVT::f64 && Subtarget.useSoftFloat())) && + !Subtarget.isGP64bit()) return std::make_pair(0U, &Mips::GPR32RegClass); - if (VT == MVT::i64 && Subtarget.isGP64bit()) + if ((VT == MVT::i64 || (VT == MVT::f64 && Subtarget.useSoftFloat())) && + Subtarget.isGP64bit()) return std::make_pair(0U, &Mips::GPR64RegClass); // This will generate an error message return std::make_pair(0U, nullptr); diff --git a/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp index aee57a5075ff71..b43eee8fdd8c0f 100644 --- a/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp +++ b/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp @@ -208,8 +208,10 @@ bool PPCExpandAtomicPseudo::expandAtomicRMW128( .addMBB(LoopMBB); CurrentMBB->addSuccessor(LoopMBB); CurrentMBB->addSuccessor(ExitMBB); - recomputeLiveIns(*LoopMBB); - recomputeLiveIns(*ExitMBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*LoopMBB); + } while (anyChange); NMBBI = MBB.end(); MI.eraseFromParent(); return true; @@ -286,9 +288,11 @@ bool PPCExpandAtomicPseudo::expandAtomicCmpSwap128( CurrentMBB->addSuccessor(LoopCmpMBB); CurrentMBB->addSuccessor(ExitMBB); - recomputeLiveIns(*LoopCmpMBB); - recomputeLiveIns(*CmpSuccMBB); - 
recomputeLiveIns(*ExitMBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*CmpSuccMBB) || + recomputeLiveIns(*LoopCmpMBB); + } while (anyChange); NMBBI = MBB.end(); MI.eraseFromParent(); return true; diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index 245e78641ed654..424501c35c043c 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -1191,12 +1191,6 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, if ((Reg == PPC::X2 || Reg == PPC::R2) && MustSaveTOC) continue; - // For SVR4, don't emit a move for the CR spill slot if we haven't - // spilled CRs. - if (isSVR4ABI && (PPC::CR2 <= Reg && Reg <= PPC::CR4) - && !MustSaveCR) - continue; - // For 64-bit SVR4 when we have spilled CRs, the spill location // is SP+8, not a frame-relative slot. if (isSVR4ABI && isPPC64 && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) { @@ -1441,8 +1435,11 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF, ProbeLoopBodyMBB->addSuccessor(ProbeLoopBodyMBB); } // Update liveins. - recomputeLiveIns(*ProbeLoopBodyMBB); - recomputeLiveIns(*ProbeExitMBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*ProbeExitMBB) || + recomputeLiveIns(*ProbeLoopBodyMBB); + } while (anyChange); return ProbeExitMBB; }; // For case HasBP && MaxAlign > 1, we have to realign the SP by performing @@ -1534,8 +1531,10 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF, buildDefCFAReg(*ExitMBB, ExitMBB->begin(), SPReg); } // Update liveins. 
- recomputeLiveIns(*LoopMBB); - recomputeLiveIns(*ExitMBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*LoopMBB); + } while (anyChange); } } ++NumPrologProbed; diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 26ed74108ec36c..18a4223d481ef0 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -1635,7 +1635,8 @@ class BitPermutationSelector { default: break; case ISD::ROTL: if (isa(V.getOperand(1))) { - unsigned RotAmt = V.getConstantOperandVal(1); + assert(isPowerOf2_32(NumBits) && "rotl bits should be power of 2!"); + unsigned RotAmt = V.getConstantOperandVal(1) & (NumBits - 1); const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second; @@ -1648,15 +1649,20 @@ class BitPermutationSelector { case ISD::SHL: case PPCISD::SHL: if (isa(V.getOperand(1))) { - unsigned ShiftAmt = V.getConstantOperandVal(1); + // sld takes 7 bits, slw takes 6. + unsigned ShiftAmt = V.getConstantOperandVal(1) & ((NumBits << 1) - 1); const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second; - for (unsigned i = ShiftAmt; i < NumBits; ++i) - Bits[i] = LHSBits[i - ShiftAmt]; - - for (unsigned i = 0; i < ShiftAmt; ++i) - Bits[i] = ValueBit(ValueBit::ConstZero); + if (ShiftAmt >= NumBits) { + for (unsigned i = 0; i < NumBits; ++i) + Bits[i] = ValueBit(ValueBit::ConstZero); + } else { + for (unsigned i = ShiftAmt; i < NumBits; ++i) + Bits[i] = LHSBits[i - ShiftAmt]; + for (unsigned i = 0; i < ShiftAmt; ++i) + Bits[i] = ValueBit(ValueBit::ConstZero); + } return std::make_pair(Interesting = true, &Bits); } @@ -1664,15 +1670,20 @@ class BitPermutationSelector { case ISD::SRL: case PPCISD::SRL: if (isa(V.getOperand(1))) { - unsigned ShiftAmt = V.getConstantOperandVal(1); + // srd takes lowest 7 bits, srw takes 6. 
+ unsigned ShiftAmt = V.getConstantOperandVal(1) & ((NumBits << 1) - 1); const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second; - for (unsigned i = 0; i < NumBits - ShiftAmt; ++i) - Bits[i] = LHSBits[i + ShiftAmt]; - - for (unsigned i = NumBits - ShiftAmt; i < NumBits; ++i) - Bits[i] = ValueBit(ValueBit::ConstZero); + if (ShiftAmt >= NumBits) { + for (unsigned i = 0; i < NumBits; ++i) + Bits[i] = ValueBit(ValueBit::ConstZero); + } else { + for (unsigned i = 0; i < NumBits - ShiftAmt; ++i) + Bits[i] = LHSBits[i + ShiftAmt]; + for (unsigned i = NumBits - ShiftAmt; i < NumBits; ++i) + Bits[i] = ValueBit(ValueBit::ConstZero); + } return std::make_pair(Interesting = true, &Bits); } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index aec58d1c0dcb9f..85f1e670045b92 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -14942,6 +14942,7 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i8, LDN->getMemOperand()); + DAG.makeEquivalentMemoryOrdering(LDN, Ld); // For signed conversion, we need to sign-extend the value in the VSR if (Signed) { diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index a0c3345ec1bbd7..ac88cd49db4e4b 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -5,6 +5,7 @@ set(LLVM_TARGET_DEFINITIONS RISCV.td) tablegen(LLVM RISCVGenAsmMatcher.inc -gen-asm-matcher) tablegen(LLVM RISCVGenAsmWriter.inc -gen-asm-writer) tablegen(LLVM RISCVGenCompressInstEmitter.inc -gen-compress-inst-emitter) +tablegen(LLVM RISCVGenMacroFusion.inc -gen-macro-fusion-pred) tablegen(LLVM RISCVGenDAGISel.inc -gen-dag-isel) tablegen(LLVM RISCVGenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM RISCVGenInstrInfo.inc -gen-instr-info) @@ -43,7 +44,6 @@ 
add_llvm_target(RISCVCodeGen RISCVISelDAGToDAG.cpp RISCVISelLowering.cpp RISCVMachineFunctionInfo.cpp - RISCVMacroFusion.cpp RISCVMergeBaseOffset.cpp RISCVOptWInstrs.cpp RISCVPostRAExpandPseudoInsts.cpp diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td index e6e879282241dd..27d52c16a4f39d 100644 --- a/llvm/lib/Target/RISCV/RISCV.td +++ b/llvm/lib/Target/RISCV/RISCV.td @@ -30,6 +30,12 @@ include "RISCVCallingConv.td" include "RISCVInstrInfo.td" include "GISel/RISCVRegisterBanks.td" +//===----------------------------------------------------------------------===// +// RISC-V macro fusions. +//===----------------------------------------------------------------------===// + +include "RISCVMacroFusion.td" + //===----------------------------------------------------------------------===// // RISC-V Scheduling Models //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 3878be680c0492..26451c80f57b42 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -72,7 +72,7 @@ def FeatureStdExtZicntr [FeatureStdExtZicsr]>; def FeatureStdExtZicond - : SubtargetFeature<"experimental-zicond", "HasStdExtZicond", "true", + : SubtargetFeature<"zicond", "HasStdExtZicond", "true", "'Zicond' (Integer Conditional Operations)">; def HasStdExtZicond : Predicate<"Subtarget->hasStdExtZicond()">, AssemblerPredicate<(all_of FeatureStdExtZicond), @@ -1044,30 +1044,6 @@ def TuneDLenFactor2 : SubtargetFeature<"dlen-factor-2", "DLenFactor2", "true", "Vector unit DLEN(data path width) is half of VLEN">; -def TuneLUIADDIFusion - : SubtargetFeature<"lui-addi-fusion", "HasLUIADDIFusion", - "true", "Enable LUI+ADDI macrofusion">; - -def TuneAUIPCADDIFusion - : SubtargetFeature<"auipc-addi-fusion", "HasAUIPCADDIFusion", - "true", "Enable AUIPC+ADDI macrofusion">; - -def TuneZExtHFusion - : 
SubtargetFeature<"zexth-fusion", "HasZExtHFusion", - "true", "Enable SLLI+SRLI to be fused to zero extension of halfword">; - -def TuneZExtWFusion - : SubtargetFeature<"zextw-fusion", "HasZExtWFusion", - "true", "Enable SLLI+SRLI to be fused to zero extension of word">; - -def TuneShiftedZExtWFusion - : SubtargetFeature<"shifted-zextw-fusion", "HasShiftedZExtWFusion", - "true", "Enable SLLI+SRLI to be fused when computing (shifted) zero extension of word">; - -def TuneLDADDFusion - : SubtargetFeature<"ld-add-fusion", "HasLDADDFusion", - "true", "Enable LD+ADD macrofusion.">; - def TuneNoDefaultUnroll : SubtargetFeature<"no-default-unroll", "EnableDefaultUnroll", "false", "Disable default unroll preference.">; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 47c6cd6e5487b8..d46093b9e260a2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3192,7 +3192,8 @@ static std::optional getExactInteger(const APFloat &APF, // Note that this method will also match potentially unappealing index // sequences, like , however it is left to the caller to // determine whether this is worth generating code for. -static std::optional isSimpleVIDSequence(SDValue Op) { +static std::optional isSimpleVIDSequence(SDValue Op, + unsigned EltSizeInBits) { unsigned NumElts = Op.getNumOperands(); assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unexpected BUILD_VECTOR"); bool IsInteger = Op.getValueType().isInteger(); @@ -3200,7 +3201,7 @@ static std::optional isSimpleVIDSequence(SDValue Op) { std::optional SeqStepDenom; std::optional SeqStepNum, SeqAddend; std::optional> PrevElt; - unsigned EltSizeInBits = Op.getValueType().getScalarSizeInBits(); + assert(EltSizeInBits >= Op.getValueType().getScalarSizeInBits()); for (unsigned Idx = 0; Idx < NumElts; Idx++) { // Assume undef elements match the sequence; we just have to be careful // when interpolating across them. 
@@ -3213,14 +3214,14 @@ static std::optional isSimpleVIDSequence(SDValue Op) { if (!isa(Op.getOperand(Idx))) return std::nullopt; Val = Op.getConstantOperandVal(Idx) & - maskTrailingOnes(EltSizeInBits); + maskTrailingOnes(Op.getScalarValueSizeInBits()); } else { // The BUILD_VECTOR must be all constants. if (!isa(Op.getOperand(Idx))) return std::nullopt; if (auto ExactInteger = getExactInteger( cast(Op.getOperand(Idx))->getValueAPF(), - EltSizeInBits)) + Op.getScalarValueSizeInBits())) Val = *ExactInteger; else return std::nullopt; @@ -3276,11 +3277,11 @@ static std::optional isSimpleVIDSequence(SDValue Op) { uint64_t Val; if (IsInteger) { Val = Op.getConstantOperandVal(Idx) & - maskTrailingOnes(EltSizeInBits); + maskTrailingOnes(Op.getScalarValueSizeInBits()); } else { Val = *getExactInteger( cast(Op.getOperand(Idx))->getValueAPF(), - EltSizeInBits); + Op.getScalarValueSizeInBits()); } uint64_t ExpectedVal = (int64_t)(Idx * (uint64_t)*SeqStepNum) / *SeqStepDenom; @@ -3550,7 +3551,7 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG, // Try and match index sequences, which we can lower to the vid instruction // with optional modifications. An all-undef vector is matched by // getSplatValue, above. - if (auto SimpleVID = isSimpleVIDSequence(Op)) { + if (auto SimpleVID = isSimpleVIDSequence(Op, Op.getScalarValueSizeInBits())) { int64_t StepNumerator = SimpleVID->StepNumerator; unsigned StepDenominator = SimpleVID->StepDenominator; int64_t Addend = SimpleVID->Addend; @@ -4718,7 +4719,7 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN, if (SrcVecIdx == -1) continue; unsigned ExtractIdx = (SrcVecIdx % VRegsPerSrc) * NumOpElts; - SDValue SrcVec = (unsigned)SrcVecIdx > VRegsPerSrc ? V2 : V1; + SDValue SrcVec = (unsigned)SrcVecIdx >= VRegsPerSrc ? 
V2 : V1; SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec, DAG.getVectorIdxConstant(ExtractIdx, DL)); SubVec = convertFromScalableVector(OneRegVT, SubVec, DAG, Subtarget); @@ -5033,60 +5034,56 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, MVT IndexContainerVT = ContainerVT.changeVectorElementType(IndexVT.getScalarType()); - // Base case for the recursion just below - handle the worst case - // single source permutation. Note that all the splat variants - // are handled above. - if (V2.isUndef()) { + SDValue Gather; + // TODO: This doesn't trigger for i64 vectors on RV32, since there we + // encounter a bitcasted BUILD_VECTOR with low/high i32 values. + if (SDValue SplatValue = DAG.getSplatValue(V1, /*LegalTypes*/ true)) { + Gather = lowerScalarSplat(SDValue(), SplatValue, VL, ContainerVT, DL, DAG, + Subtarget); + } else { V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget); - SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS); - LHSIndices = convertToScalableVector(IndexContainerVT, LHSIndices, DAG, - Subtarget); - SDValue Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices, - DAG.getUNDEF(ContainerVT), TrueMask, VL); - return convertFromScalableVector(VT, Gather, DAG, Subtarget); - } - - // Translate the gather index we computed above (and possibly swapped) - // back to a shuffle mask. This step should disappear once we complete - // the migration to recursive design. - SmallVector ShuffleMaskLHS; - ShuffleMaskLHS.reserve(GatherIndicesLHS.size()); - for (SDValue GatherIndex : GatherIndicesLHS) { - if (GatherIndex.isUndef()) { - ShuffleMaskLHS.push_back(-1); - continue; + // If only one index is used, we can use a "splat" vrgather. + // TODO: We can splat the most-common index and fix-up any stragglers, if + // that's beneficial. 
+ if (LHSIndexCounts.size() == 1) { + int SplatIndex = LHSIndexCounts.begin()->getFirst(); + Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V1, + DAG.getConstant(SplatIndex, DL, XLenVT), + DAG.getUNDEF(ContainerVT), TrueMask, VL); + } else { + SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS); + LHSIndices = + convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget); + + Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices, + DAG.getUNDEF(ContainerVT), TrueMask, VL); } - auto *IdxC = cast(GatherIndex); - ShuffleMaskLHS.push_back(IdxC->getZExtValue()); } - // Recursively invoke lowering for the LHS as if there were no RHS. - // This allows us to leverage all of our single source permute tricks. - SDValue Gather = - DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), ShuffleMaskLHS); - Gather = convertToScalableVector(ContainerVT, Gather, DAG, Subtarget); + // If a second vector operand is used by this shuffle, blend it in with an + // additional vrgather. + if (!V2.isUndef()) { + V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget); - // Blend in second vector source with an additional vrgather. - V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget); + MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1); + SelectMask = + convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget); - MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1); - SelectMask = - convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget); - - // If only one index is used, we can use a "splat" vrgather. - // TODO: We can splat the most-common index and fix-up any stragglers, if - // that's beneficial. 
- if (RHSIndexCounts.size() == 1) { - int SplatIndex = RHSIndexCounts.begin()->getFirst(); - Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2, - DAG.getConstant(SplatIndex, DL, XLenVT), Gather, - SelectMask, VL); - } else { - SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS); - RHSIndices = - convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget); - Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, Gather, - SelectMask, VL); + // If only one index is used, we can use a "splat" vrgather. + // TODO: We can splat the most-common index and fix-up any stragglers, if + // that's beneficial. + if (RHSIndexCounts.size() == 1) { + int SplatIndex = RHSIndexCounts.begin()->getFirst(); + Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2, + DAG.getConstant(SplatIndex, DL, XLenVT), Gather, + SelectMask, VL); + } else { + SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS); + RHSIndices = + convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget); + Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, Gather, + SelectMask, VL); + } } return convertFromScalableVector(VT, Gather, DAG, Subtarget); @@ -14562,7 +14559,7 @@ static SDValue tryFoldSelectIntoOp(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); SDLoc DL(N); SDValue OtherOp = TrueVal.getOperand(1 - OpToFold); - EVT OtherOpVT = OtherOp->getValueType(0); + EVT OtherOpVT = OtherOp.getValueType(); SDValue IdentityOperand = DAG.getNeutralElement(Opc, DL, OtherOpVT, N->getFlags()); if (!Commutative) @@ -14658,8 +14655,8 @@ static SDValue useInversedSetcc(SDNode *N, SelectionDAG &DAG, ISD::CondCode CC = cast(Cond.getOperand(2))->get(); if (CC == ISD::SETEQ && LHS.getOpcode() == ISD::AND && isa(LHS.getOperand(1)) && isNullConstant(RHS)) { - uint64_t MaskVal = LHS.getConstantOperandVal(1); - if (isPowerOf2_64(MaskVal) && !isInt<12>(MaskVal)) + const APInt &MaskVal = LHS.getConstantOperandAPInt(1); + if 
(MaskVal.isPowerOf2() && !MaskVal.isSignedIntN(12)) return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CondVT, LHS, RHS, ISD::SETNE), False, True); @@ -15565,8 +15562,11 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, MGN->getMemOperand(), IndexType, MGN->getExtensionType()); if (Index.getOpcode() == ISD::BUILD_VECTOR && - MGN->getExtensionType() == ISD::NON_EXTLOAD) { - if (std::optional SimpleVID = isSimpleVIDSequence(Index); + MGN->getExtensionType() == ISD::NON_EXTLOAD && isTypeLegal(VT)) { + // The sequence will be XLenVT, not the type of Index. Tell + // isSimpleVIDSequence this so we avoid overflow. + if (std::optional SimpleVID = + isSimpleVIDSequence(Index, Subtarget.getXLen()); SimpleVID && SimpleVID->StepDenominator == 1) { const int64_t StepNumerator = SimpleVID->StepNumerator; const int64_t Addend = SimpleVID->Addend; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 592962cebe8973..d5b1ddfbeb3dc9 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1229,7 +1229,8 @@ bool RISCVInstrInfo::optimizeCondBranch(MachineInstr &MI) const { MachineBasicBlock::reverse_iterator II(&MI), E = MBB->rend(); auto DefC1 = std::find_if(++II, E, [&](const MachineInstr &I) -> bool { int64_t Imm; - return isLoadImm(&I, Imm) && Imm == C1; + return isLoadImm(&I, Imm) && Imm == C1 && + I.getOperand(0).getReg().isVirtual(); }); if (DefC1 != E) return DefC1->getOperand(0).getReg(); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZicond.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZicond.td index 0790a941823b1a..35d3fdae0bd79b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZicond.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZicond.td @@ -8,8 +8,6 @@ // // This file describes the RISC-V instructions from the standard Integer // Conditional operations extension (Zicond). -// This version is still experimental as the 'Zicond' extension hasn't been -// ratified yet. 
It is based on v1.0-rc1 of the specification. // //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp b/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp deleted file mode 100644 index f948f05b22f772..00000000000000 --- a/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp +++ /dev/null @@ -1,210 +0,0 @@ -//===- RISCVMacroFusion.cpp - RISC-V Macro Fusion -------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file This file contains the RISC-V implementation of the DAG scheduling -/// mutation to pair instructions back to back. -// -//===----------------------------------------------------------------------===// -// -#include "RISCVMacroFusion.h" -#include "RISCVSubtarget.h" -#include "llvm/CodeGen/MacroFusion.h" -#include "llvm/CodeGen/TargetInstrInfo.h" - -using namespace llvm; - -static bool checkRegisters(Register FirstDest, const MachineInstr &SecondMI) { - if (!SecondMI.getOperand(1).isReg()) - return false; - - if (SecondMI.getOperand(1).getReg() != FirstDest) - return false; - - // If the input is virtual make sure this is the only user. 
- if (FirstDest.isVirtual()) { - auto &MRI = SecondMI.getMF()->getRegInfo(); - return MRI.hasOneNonDBGUse(FirstDest); - } - - return SecondMI.getOperand(0).getReg() == FirstDest; -} - -// Fuse load with add: -// add rd, rs1, rs2 -// ld rd, 0(rd) -static bool isLDADD(const MachineInstr *FirstMI, const MachineInstr &SecondMI) { - if (SecondMI.getOpcode() != RISCV::LD) - return false; - - if (!SecondMI.getOperand(2).isImm()) - return false; - - if (SecondMI.getOperand(2).getImm() != 0) - return false; - - // Given SecondMI, when FirstMI is unspecified, we must return - // if SecondMI may be part of a fused pair at all. - if (!FirstMI) - return true; - - if (FirstMI->getOpcode() != RISCV::ADD) - return true; - - return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); -} - -// Fuse zero extension of halfword: -// slli rd, rs1, 48 -// srli rd, rd, 48 -static bool isZExtH(const MachineInstr *FirstMI, const MachineInstr &SecondMI) { - if (SecondMI.getOpcode() != RISCV::SRLI) - return false; - - if (!SecondMI.getOperand(2).isImm()) - return false; - - if (SecondMI.getOperand(2).getImm() != 48) - return false; - - // Given SecondMI, when FirstMI is unspecified, we must return - // if SecondMI may be part of a fused pair at all. - if (!FirstMI) - return true; - - if (FirstMI->getOpcode() != RISCV::SLLI) - return false; - - if (FirstMI->getOperand(2).getImm() != 48) - return false; - - return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); -} - -// Fuse zero extension of word: -// slli rd, rs1, 32 -// srli rd, rd, 32 -static bool isZExtW(const MachineInstr *FirstMI, const MachineInstr &SecondMI) { - if (SecondMI.getOpcode() != RISCV::SRLI) - return false; - - if (!SecondMI.getOperand(2).isImm()) - return false; - - if (SecondMI.getOperand(2).getImm() != 32) - return false; - - // Given SecondMI, when FirstMI is unspecified, we must return - // if SecondMI may be part of a fused pair at all. 
- if (!FirstMI) - return true; - - if (FirstMI->getOpcode() != RISCV::SLLI) - return false; - - if (FirstMI->getOperand(2).getImm() != 32) - return false; - - return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); -} - -// Fuse shifted zero extension of word: -// slli rd, rs1, 32 -// srli rd, rd, x -// where 0 <= x < 32 -static bool isShiftedZExtW(const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { - if (SecondMI.getOpcode() != RISCV::SRLI) - return false; - - if (!SecondMI.getOperand(2).isImm()) - return false; - - unsigned SRLIImm = SecondMI.getOperand(2).getImm(); - if (SRLIImm >= 32) - return false; - - // Given SecondMI, when FirstMI is unspecified, we must return - // if SecondMI may be part of a fused pair at all. - if (!FirstMI) - return true; - - if (FirstMI->getOpcode() != RISCV::SLLI) - return false; - - if (FirstMI->getOperand(2).getImm() != 32) - return false; - - return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); -} - -// Fuse AUIPC followed by ADDI -// auipc rd, imm20 -// addi rd, rd, imm12 -static bool isAUIPCADDI(const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { - if (SecondMI.getOpcode() != RISCV::ADDI) - return false; - // Assume the 1st instr to be a wildcard if it is unspecified. - if (!FirstMI) - return true; - - if (FirstMI->getOpcode() != RISCV::AUIPC) - return false; - - return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); -} - -// Fuse LUI followed by ADDI or ADDIW. -// rd = imm[31:0] which decomposes to -// lui rd, imm[31:12] -// addi(w) rd, rd, imm[11:0] -static bool isLUIADDI(const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { - if (SecondMI.getOpcode() != RISCV::ADDI && - SecondMI.getOpcode() != RISCV::ADDIW) - return false; - // Assume the 1st instr to be a wildcard if it is unspecified. 
- if (!FirstMI) - return true; - - if (FirstMI->getOpcode() != RISCV::LUI) - return false; - - return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); -} - -static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, - const TargetSubtargetInfo &TSI, - const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { - const RISCVSubtarget &ST = static_cast(TSI); - - if (ST.hasLUIADDIFusion() && isLUIADDI(FirstMI, SecondMI)) - return true; - - if (ST.hasAUIPCADDIFusion() && isAUIPCADDI(FirstMI, SecondMI)) - return true; - - if (ST.hasZExtHFusion() && isZExtH(FirstMI, SecondMI)) - return true; - - if (ST.hasZExtWFusion() && isZExtW(FirstMI, SecondMI)) - return true; - - if (ST.hasShiftedZExtWFusion() && isShiftedZExtW(FirstMI, SecondMI)) - return true; - - if (ST.hasLDADDFusion() && isLDADD(FirstMI, SecondMI)) - return true; - - return false; -} - -std::unique_ptr llvm::createRISCVMacroFusionDAGMutation() { - return createMacroFusionDAGMutation(shouldScheduleAdjacent); -} diff --git a/llvm/lib/Target/RISCV/RISCVMacroFusion.h b/llvm/lib/Target/RISCV/RISCVMacroFusion.h deleted file mode 100644 index 7598db3f8fe143..00000000000000 --- a/llvm/lib/Target/RISCV/RISCVMacroFusion.h +++ /dev/null @@ -1,28 +0,0 @@ -//===- RISCVMacroFusion.h - RISC-V Macro Fusion -----------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file This file contains the RISC-V definition of the DAG scheduling -/// mutation to pair instructions back to back. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_RISCV_RISCVMACROFUSION_H -#define LLVM_LIB_TARGET_RISCV_RISCVMACROFUSION_H - -#include "llvm/CodeGen/MachineScheduler.h" - -namespace llvm { - -/// Note that you have to add: -/// DAG.addMutation(createRISCVMacroFusionDAGMutation()); -/// to RISCVPassConfig::createMachineScheduler() to have an effect. -std::unique_ptr createRISCVMacroFusionDAGMutation(); - -} // namespace llvm - -#endif diff --git a/llvm/lib/Target/RISCV/RISCVMacroFusion.td b/llvm/lib/Target/RISCV/RISCVMacroFusion.td new file mode 100644 index 00000000000000..875a93d09a2c64 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVMacroFusion.td @@ -0,0 +1,93 @@ +//==----- RISCVMacroFusion.td - Macro Fusion Definitions -----*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// ===---------------------------------------------------------------------===// +// The following definitions describe the macro fusion predicators. 
+ +// Fuse LUI followed by ADDI or ADDIW: +// rd = imm[31:0] which decomposes to +// lui rd, imm[31:12] +// addi(w) rd, rd, imm[11:0] +def TuneLUIADDIFusion + : SimpleFusion<"lui-addi-fusion", "HasLUIADDIFusion", + "Enable LUI+ADDI macro fusion", + CheckOpcode<[LUI]>, + CheckOpcode<[ADDI, ADDIW]>>; + +// Fuse AUIPC followed by ADDI: +// auipc rd, imm20 +// addi rd, rd, imm12 +def TuneAUIPCADDIFusion + : SimpleFusion<"auipc-addi-fusion", "HasAUIPCADDIFusion", + "Enable AUIPC+ADDI macrofusion", + CheckOpcode<[AUIPC]>, + CheckOpcode<[ADDI]>>; + +// Fuse zero extension of halfword: +// slli rd, rs1, 48 +// srli rd, rd, 48 +def TuneZExtHFusion + : SimpleFusion<"zexth-fusion", "HasZExtHFusion", + "Enable SLLI+SRLI to be fused to zero extension of halfword", + CheckAll<[ + CheckOpcode<[SLLI]>, + CheckIsImmOperand<2>, + CheckImmOperand<2, 48> + ]>, + CheckAll<[ + CheckOpcode<[SRLI]>, + CheckIsImmOperand<2>, + CheckImmOperand<2, 48> + ]>>; + +// Fuse zero extension of word: +// slli rd, rs1, 32 +// srli rd, rd, 32 +def TuneZExtWFusion + : SimpleFusion<"zextw-fusion", "HasZExtWFusion", + "Enable SLLI+SRLI to be fused to zero extension of word", + CheckAll<[ + CheckOpcode<[SLLI]>, + CheckIsImmOperand<2>, + CheckImmOperand<2, 32> + ]>, + CheckAll<[ + CheckOpcode<[SRLI]>, + CheckIsImmOperand<2>, + CheckImmOperand<2, 32> + ]>>; + +// Fuse shifted zero extension of word: +// slli rd, rs1, 32 +// srli rd, rd, x +// where 0 <= x < 32 +def TuneShiftedZExtWFusion + : SimpleFusion<"shifted-zextw-fusion", "HasShiftedZExtWFusion", + "Enable SLLI+SRLI to be fused when computing (shifted) word zero extension", + CheckAll<[ + CheckOpcode<[SLLI]>, + CheckIsImmOperand<2>, + CheckImmOperand<2, 32> + ]>, + CheckAll<[ + CheckOpcode<[SRLI]>, + CheckIsImmOperand<2>, + CheckImmOperandRange<2, 0, 31> + ]>>; + +// Fuse load with add: +// add rd, rs1, rs2 +// ld rd, 0(rd) +def TuneLDADDFusion + : SimpleFusion<"ld-add-fusion", "HasLDADDFusion", "Enable LD+ADD macrofusion", + CheckOpcode<[ADD]>, + 
CheckAll<[ + CheckOpcode<[LD]>, + CheckIsImmOperand<2>, + CheckImmOperand<2, 0> + ]>>; diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index 7b64d3cee9c800..d3236bb07d56d5 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -16,8 +16,9 @@ #include "GISel/RISCVRegisterBankInfo.h" #include "RISCV.h" #include "RISCVFrameLowering.h" -#include "RISCVMacroFusion.h" #include "RISCVTargetMachine.h" +#include "llvm/CodeGen/MacroFusion.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/ErrorHandling.h" @@ -29,6 +30,9 @@ using namespace llvm; #define GET_SUBTARGETINFO_CTOR #include "RISCVGenSubtargetInfo.inc" +#define GET_RISCV_MACRO_FUSION_PRED_IMPL +#include "RISCVGenMacroFusion.inc" + namespace llvm::RISCVTuneInfoTable { #define GET_RISCVTuneInfoTable_IMPL @@ -187,7 +191,7 @@ bool RISCVSubtarget::enableSubRegLiveness() const { void RISCVSubtarget::getPostRAMutations( std::vector> &Mutations) const { - Mutations.push_back(createRISCVMacroFusionDAGMutation()); + Mutations.push_back(createMacroFusionDAGMutation(getMacroFusions())); } /// Enable use of alias analysis during code generation (during MI diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 2ba93764facd07..8c55efa69a6a5f 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -27,6 +27,9 @@ #include "llvm/Target/TargetMachine.h" #include +#define GET_RISCV_MACRO_FUSION_PRED_DECL +#include "RISCVGenMacroFusion.inc" + #define GET_SUBTARGETINFO_HEADER #include "RISCVGenSubtargetInfo.inc" @@ -196,11 +199,6 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { return UserReservedRegister[i]; } - bool hasMacroFusion() const { - return hasLUIADDIFusion() || hasAUIPCADDIFusion() || hasZExtHFusion() || - hasZExtWFusion() || hasShiftedZExtWFusion() || hasLDADDFusion(); - } - // 
Vector codegen related methods. bool hasVInstructions() const { return HasStdExtZve32x; } bool hasVInstructionsI64() const { return HasStdExtZve64x; } diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index b4b81b545a54bb..2285c99d790100 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -14,7 +14,6 @@ #include "MCTargetDesc/RISCVBaseInfo.h" #include "RISCV.h" #include "RISCVMachineFunctionInfo.h" -#include "RISCVMacroFusion.h" #include "RISCVTargetObjectFile.h" #include "RISCVTargetTransformInfo.h" #include "TargetInfo/RISCVTargetInfo.h" @@ -26,6 +25,8 @@ #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/MIRParser/MIParser.h" #include "llvm/CodeGen/MIRYamlMapping.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/MacroFusion.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" @@ -361,9 +362,10 @@ class RISCVPassConfig : public TargetPassConfig { DAG->addMutation(createLoadClusterDAGMutation( DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true)); } - if (ST.hasMacroFusion()) { + const auto &MacroFusions = ST.getMacroFusions(); + if (!MacroFusions.empty()) { DAG = DAG ? 
DAG : createGenericSchedLive(C); - DAG->addMutation(createRISCVMacroFusionDAGMutation()); + DAG->addMutation(createMacroFusionDAGMutation(MacroFusions)); } return DAG; } @@ -371,9 +373,10 @@ class RISCVPassConfig : public TargetPassConfig { ScheduleDAGInstrs * createPostMachineScheduler(MachineSchedContext *C) const override { const RISCVSubtarget &ST = C->MF->getSubtarget(); - if (ST.hasMacroFusion()) { + const auto &MacroFusions = ST.getMacroFusions(); + if (!MacroFusions.empty()) { ScheduleDAGMI *DAG = createGenericSchedPostRA(C); - DAG->addMutation(createRISCVMacroFusionDAGMutation()); + DAG->addMutation(createMacroFusionDAGMutation(MacroFusions)); return DAG; } return nullptr; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 866d5cf340e68b..66dab70d455ff4 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -37,6 +37,9 @@ static cl::opt SLPMaxVF( InstructionCost RISCVTTIImpl::getRISCVInstructionCost(ArrayRef OpCodes, MVT VT, TTI::TargetCostKind CostKind) { + // Check if the type is valid for all CostKind + if (!VT.isVector()) + return InstructionCost::getInvalid(); size_t NumInstr = OpCodes.size(); if (CostKind == TTI::TCK_CodeSize) return NumInstr; diff --git a/llvm/lib/Target/Sparc/Sparc.td b/llvm/lib/Target/Sparc/Sparc.td index 7b103395652433..38a59e650f33c7 100644 --- a/llvm/lib/Target/Sparc/Sparc.td +++ b/llvm/lib/Target/Sparc/Sparc.td @@ -72,6 +72,20 @@ def TuneSlowRDPC : SubtargetFeature<"slow-rdpc", "HasSlowRDPC", "true", //==== Features added predmoninantly for LEON subtarget support include "LeonFeatures.td" +//==== Register allocation tweaks needed by some low-level software +foreach i = 1 ... 7 in + def FeatureReserveG#i : SubtargetFeature<"reserve-g"#i, "ReserveRegister["#i#" + SP::G0]", "true", + "Reserve G"#i#", making it unavailable as a GPR">; +foreach i = 0 ... 
5 in + def FeatureReserveO#i : SubtargetFeature<"reserve-o"#i, "ReserveRegister["#i#" + SP::O0]", "true", + "Reserve O"#i#", making it unavailable as a GPR">; +foreach i = 0 ... 7 in + def FeatureReserveL#i : SubtargetFeature<"reserve-l"#i, "ReserveRegister["#i#" + SP::L0]", "true", + "Reserve L"#i#", making it unavailable as a GPR">; +foreach i = 0 ... 5 in + def FeatureReserveI#i : SubtargetFeature<"reserve-i"#i, "ReserveRegister["#i#" + SP::I0]", "true", + "Reserve I"#i#", making it unavailable as a GPR">; + //===----------------------------------------------------------------------===// // Register File, Calling Conv, Instruction Descriptions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp index 215a8ea8319046..6855471840e9db 100644 --- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp +++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp @@ -434,6 +434,50 @@ bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, default: // See if this is a generic print operand return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O); + case 'L': // Low order register of a twin word register operand + case 'H': // High order register of a twin word register operand + { + const SparcSubtarget &Subtarget = MF->getSubtarget(); + const MachineOperand &MO = MI->getOperand(OpNo); + const SparcRegisterInfo *RegisterInfo = Subtarget.getRegisterInfo(); + Register MOReg = MO.getReg(); + + Register HiReg, LoReg; + if (!SP::IntPairRegClass.contains(MOReg)) { + // If we aren't given a register pair already, find out which pair it + // belongs to. Note that here, the specified register operand, which + // refers to the high part of the twinword, needs to be an even-numbered + // register. 
+ MOReg = RegisterInfo->getMatchingSuperReg(MOReg, SP::sub_even, + &SP::IntPairRegClass); + if (!MOReg) { + SMLoc Loc; + OutContext.reportError( + Loc, "Hi part of pair should point to an even-numbered register"); + OutContext.reportError( + Loc, "(note that in some cases it might be necessary to manually " + "bind the input/output registers instead of relying on " + "automatic allocation)"); + return true; + } + } + + HiReg = RegisterInfo->getSubReg(MOReg, SP::sub_even); + LoReg = RegisterInfo->getSubReg(MOReg, SP::sub_odd); + + Register Reg; + switch (ExtraCode[0]) { + case 'L': + Reg = LoReg; + break; + case 'H': + Reg = HiReg; + break; + } + + O << '%' << SparcInstPrinter::getRegisterName(Reg); + return false; + } case 'f': case 'r': break; diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 78bdf3ae9a84ba..bdefb0841a124b 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -13,6 +13,7 @@ #include "SparcISelLowering.h" #include "MCTargetDesc/SparcMCExpr.h" +#include "MCTargetDesc/SparcMCTargetDesc.h" #include "SparcMachineFunctionInfo.h" #include "SparcRegisterInfo.h" #include "SparcTargetMachine.h" @@ -28,6 +29,7 @@ #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" #include "llvm/Support/ErrorHandling.h" @@ -729,6 +731,30 @@ SDValue SparcTargetLowering::LowerFormalArguments_64( return Chain; } +// Check whether any of the argument registers are reserved +static bool isAnyArgRegReserved(const SparcRegisterInfo *TRI, + const MachineFunction &MF) { + // The register window design means that outgoing parameters at O* + // will appear in the callee as I*. + // Be conservative and check both sides of the register names. 
+ bool Outgoing = + llvm::any_of(SP::GPROutgoingArgRegClass, [TRI, &MF](MCPhysReg r) { + return TRI->isReservedReg(MF, r); + }); + bool Incoming = + llvm::any_of(SP::GPRIncomingArgRegClass, [TRI, &MF](MCPhysReg r) { + return TRI->isReservedReg(MF, r); + }); + return Outgoing || Incoming; +} + +static void emitReservedArgRegCallError(const MachineFunction &MF) { + const Function &F = MF.getFunction(); + F.getContext().diagnose(DiagnosticInfoUnsupported{ + F, ("SPARC doesn't support" + " function calls if any of the argument registers is reserved.")}); +} + SDValue SparcTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { @@ -805,6 +831,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, bool &isTailCall = CLI.IsTailCall; CallingConv::ID CallConv = CLI.CallConv; bool isVarArg = CLI.IsVarArg; + MachineFunction &MF = DAG.getMachineFunction(); // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; @@ -1055,6 +1082,10 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, ((hasReturnsTwice) ? TRI->getRTCallPreservedMask(CallConv) : TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv)); + + if (isAnyArgRegReserved(TRI, MF)) + emitReservedArgRegCallError(MF); + assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -1125,6 +1156,13 @@ Register SparcTargetLowering::getRegisterByName(const char* RegName, LLT VT, .Case("g4", SP::G4).Case("g5", SP::G5).Case("g6", SP::G6).Case("g7", SP::G7) .Default(0); + // If we're directly referencing register names + // (e.g in GCC C extension `register int r asm("g1");`), + // make sure that said register is in the reserve list. 
+ const SparcRegisterInfo *TRI = Subtarget->getRegisterInfo(); + if (!TRI->isReservedReg(MF, Reg)) + Reg = 0; + if (Reg) return Reg; @@ -1189,6 +1227,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, SDLoc DL = CLI.DL; SDValue Chain = CLI.Chain; auto PtrVT = getPointerTy(DAG.getDataLayout()); + MachineFunction &MF = DAG.getMachineFunction(); // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; @@ -1372,6 +1411,10 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, ((hasReturnsTwice) ? TRI->getRTCallPreservedMask(CLI.CallConv) : TRI->getCallPreservedMask(DAG.getMachineFunction(), CLI.CallConv)); + + if (isAnyArgRegReserved(TRI, MF)) + emitReservedArgRegCallError(MF); + assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); diff --git a/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp b/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp index f97bf57627d1aa..71a27f77d2c6bf 100644 --- a/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp +++ b/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp @@ -12,10 +12,8 @@ #include "SparcRegisterInfo.h" #include "Sparc.h" -#include "SparcMachineFunctionInfo.h" #include "SparcSubtarget.h" #include "llvm/ADT/BitVector.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -98,9 +96,21 @@ BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const { for (unsigned n = 0; n < 31; n++) Reserved.set(SP::ASR1 + n); + for (TargetRegisterClass::iterator i = SP::IntRegsRegClass.begin(); + i != SP::IntRegsRegClass.end(); ++i) { + if (MF.getSubtarget().isRegisterReserved(*i)) + markSuperRegs(Reserved, *i); + } + + assert(checkAllSuperRegsMarked(Reserved)); return Reserved; } +bool SparcRegisterInfo::isReservedReg(const MachineFunction &MF, + MCRegister Reg) const { + return 
getReservedRegs(MF)[Reg]; +} + const TargetRegisterClass* SparcRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) const { diff --git a/llvm/lib/Target/Sparc/SparcRegisterInfo.h b/llvm/lib/Target/Sparc/SparcRegisterInfo.h index 5b3c1a7ad07dd5..58c85f33635f2d 100644 --- a/llvm/lib/Target/Sparc/SparcRegisterInfo.h +++ b/llvm/lib/Target/Sparc/SparcRegisterInfo.h @@ -30,6 +30,7 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo { const uint32_t* getRTCallPreservedMask(CallingConv::ID CC) const; BitVector getReservedRegs(const MachineFunction &MF) const override; + bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const; const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF, unsigned Kind) const override; diff --git a/llvm/lib/Target/Sparc/SparcRegisterInfo.td b/llvm/lib/Target/Sparc/SparcRegisterInfo.td index d5ba7464695c5f..d8319a8d41dda0 100644 --- a/llvm/lib/Target/Sparc/SparcRegisterInfo.td +++ b/llvm/lib/Target/Sparc/SparcRegisterInfo.td @@ -370,6 +370,10 @@ def LowQFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 7)>; // Floating point control register classes. def FCCRegs : RegisterClass<"SP", [i1], 1, (sequence "FCC%u", 0, 3)>; +// GPR argument registers. 
+def GPROutgoingArg : RegisterClass<"SP", [i32, i64], 32, (sequence "O%u", 0, 5)>; +def GPRIncomingArg : RegisterClass<"SP", [i32, i64], 32, (sequence "I%u", 0, 5)>; + let isAllocatable = 0 in { // Ancillary state registers // FIXME: TICK is special-cased here as it can be accessed diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.cpp b/llvm/lib/Target/Sparc/SparcSubtarget.cpp index 6b09904ca5e8e5..5b65e34e0f8a36 100644 --- a/llvm/lib/Target/Sparc/SparcSubtarget.cpp +++ b/llvm/lib/Target/Sparc/SparcSubtarget.cpp @@ -50,6 +50,7 @@ SparcSubtarget::SparcSubtarget(const StringRef &CPU, const StringRef &TuneCPU, const StringRef &FS, const TargetMachine &TM, bool is64Bit) : SparcGenSubtargetInfo(TM.getTargetTriple(), CPU, TuneCPU, FS), + ReserveRegister(TM.getMCRegisterInfo()->getNumRegs()), TargetTriple(TM.getTargetTriple()), Is64Bit(is64Bit), InstrInfo(initializeSubtargetDependencies(CPU, TuneCPU, FS)), TLInfo(TM, *this), FrameLowering(*this) {} diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.h b/llvm/lib/Target/Sparc/SparcSubtarget.h index cdb210f67482c4..fe4aca5195306a 100644 --- a/llvm/lib/Target/Sparc/SparcSubtarget.h +++ b/llvm/lib/Target/Sparc/SparcSubtarget.h @@ -13,12 +13,14 @@ #ifndef LLVM_LIB_TARGET_SPARC_SPARCSUBTARGET_H #define LLVM_LIB_TARGET_SPARC_SPARCSUBTARGET_H +#include "MCTargetDesc/SparcMCTargetDesc.h" #include "SparcFrameLowering.h" #include "SparcISelLowering.h" #include "SparcInstrInfo.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/TargetParser/Triple.h" #include @@ -29,6 +31,10 @@ namespace llvm { class StringRef; class SparcSubtarget : public SparcGenSubtargetInfo { + // ReserveRegister[i] - Register #i is not available as a general purpose + // register. 
+ BitVector ReserveRegister; + Triple TargetTriple; virtual void anchor(); @@ -82,6 +88,10 @@ class SparcSubtarget : public SparcGenSubtargetInfo { return is64Bit() ? 2047 : 0; } + bool isRegisterReserved(MCPhysReg PhysReg) const { + return ReserveRegister[PhysReg]; + } + /// Given a actual stack size as determined by FrameInfo, this function /// returns adjusted framesize which includes space for register window /// spills and arguments. diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index db19c8881c685a..80c994a32ea96a 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -840,8 +840,10 @@ void SystemZELFFrameLowering::inlineStackProbe( StackAllocMI->eraseFromParent(); if (DoneMBB != nullptr) { // Compute the live-in lists for the new blocks. - recomputeLiveIns(*DoneMBB); - recomputeLiveIns(*LoopMBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*DoneMBB) || recomputeLiveIns(*LoopMBB); + } while (anyChange); } } @@ -1439,8 +1441,10 @@ void SystemZXPLINKFrameLowering::inlineStackProbe( StackAllocMI->eraseFromParent(); // Compute the live-in lists for the new blocks. 
- recomputeLiveIns(*NextMBB); - recomputeLiveIns(*StackExtMBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*StackExtMBB) || recomputeLiveIns(*NextMBB); + } while (anyChange); } bool SystemZXPLINKFrameLowering::hasFP(const MachineFunction &MF) const { diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 924df12578fe4b..5e0b0594b0a421 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1067,7 +1067,8 @@ bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL, if (!isInt<20>(AM.BaseOffs)) return false; - bool RequireD12 = Subtarget.hasVector() && Ty->isVectorTy(); + bool RequireD12 = + Subtarget.hasVector() && (Ty->isVectorTy() || Ty->isIntegerTy(128)); AddressingMode SupportedAM(!RequireD12, true); if (I != nullptr) SupportedAM = supportedAddressingMode(I, Subtarget.hasVector()); @@ -1922,7 +1923,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, unsigned N = getNumRegistersForCallingConv(Ctx, CLI.CallConv, OrigArgVT); SlotVT = EVT::getIntegerVT(Ctx, PartVT.getSizeInBits() * N); } else { - SlotVT = Outs[I].ArgVT; + SlotVT = Outs[I].VT; } SDValue SpillSlot = DAG.CreateStackTemporary(SlotVT); int FI = cast(SpillSlot)->getIndex(); @@ -4251,6 +4252,7 @@ SDValue SystemZTargetLowering::lowerXALUO(SDValue Op, if (N->getValueType(0) == MVT::i128) { unsigned BaseOp = 0; unsigned FlagOp = 0; + bool IsBorrow = false; switch (Op.getOpcode()) { default: llvm_unreachable("Unknown instruction!"); case ISD::UADDO: @@ -4260,6 +4262,7 @@ SDValue SystemZTargetLowering::lowerXALUO(SDValue Op, case ISD::USUBO: BaseOp = ISD::SUB; FlagOp = SystemZISD::VSCBI; + IsBorrow = true; break; } SDValue Result = DAG.getNode(BaseOp, DL, MVT::i128, LHS, RHS); @@ -4267,6 +4270,9 @@ SDValue SystemZTargetLowering::lowerXALUO(SDValue Op, Flag = DAG.getNode(ISD::AssertZext, DL, MVT::i128, Flag, DAG.getValueType(MVT::i1)); Flag = 
DAG.getZExtOrTrunc(Flag, DL, N->getValueType(1)); + if (IsBorrow) + Flag = DAG.getNode(ISD::XOR, DL, Flag.getValueType(), + Flag, DAG.getConstant(1, DL, Flag.getValueType())); return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Flag); } @@ -4339,6 +4345,7 @@ SDValue SystemZTargetLowering::lowerUADDSUBO_CARRY(SDValue Op, if (VT == MVT::i128) { unsigned BaseOp = 0; unsigned FlagOp = 0; + bool IsBorrow = false; switch (Op.getOpcode()) { default: llvm_unreachable("Unknown instruction!"); case ISD::UADDO_CARRY: @@ -4348,14 +4355,21 @@ SDValue SystemZTargetLowering::lowerUADDSUBO_CARRY(SDValue Op, case ISD::USUBO_CARRY: BaseOp = SystemZISD::VSBI; FlagOp = SystemZISD::VSBCBI; + IsBorrow = true; break; } + if (IsBorrow) + Carry = DAG.getNode(ISD::XOR, DL, Carry.getValueType(), + Carry, DAG.getConstant(1, DL, Carry.getValueType())); Carry = DAG.getZExtOrTrunc(Carry, DL, MVT::i128); SDValue Result = DAG.getNode(BaseOp, DL, MVT::i128, LHS, RHS, Carry); SDValue Flag = DAG.getNode(FlagOp, DL, MVT::i128, LHS, RHS, Carry); Flag = DAG.getNode(ISD::AssertZext, DL, MVT::i128, Flag, DAG.getValueType(MVT::i1)); Flag = DAG.getZExtOrTrunc(Flag, DL, N->getValueType(1)); + if (IsBorrow) + Flag = DAG.getNode(ISD::XOR, DL, Flag.getValueType(), + Flag, DAG.getConstant(1, DL, Flag.getValueType())); return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Flag); } @@ -6610,6 +6624,27 @@ SDValue SystemZTargetLowering::combineZERO_EXTEND( return NewSelect; } } + // Convert (zext (xor (trunc X), C)) into (xor (trunc X), C') if the size + // of the result is smaller than the size of X and all the truncated bits + // of X are already zero. 
+ if (N0.getOpcode() == ISD::XOR && + N0.hasOneUse() && N0.getOperand(0).hasOneUse() && + N0.getOperand(0).getOpcode() == ISD::TRUNCATE && + N0.getOperand(1).getOpcode() == ISD::Constant) { + SDValue X = N0.getOperand(0).getOperand(0); + if (VT.isScalarInteger() && VT.getSizeInBits() < X.getValueSizeInBits()) { + KnownBits Known = DAG.computeKnownBits(X); + APInt TruncatedBits = APInt::getBitsSet(X.getValueSizeInBits(), + N0.getValueSizeInBits(), + VT.getSizeInBits()); + if (TruncatedBits.isSubsetOf(Known.Zero)) { + X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X); + APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits()); + return DAG.getNode(ISD::XOR, SDLoc(N0), VT, + X, DAG.getConstant(Mask, SDLoc(N0), VT)); + } + } + } return SDValue(); } diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp index 9f0fd4d0938e97..87ec8aa23080e0 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -877,7 +877,6 @@ void X86AsmPrinter::emitStartOfAsmFile(Module &M) { OutStreamer->emitInt32(FeatureFlagsAnd); // data emitAlignment(WordSize == 4 ? 
Align(4) : Align(8)); // padding - OutStreamer->endSection(Nt); OutStreamer->switchSection(Cur); } } diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index c0d358ead2787b..c2f76a3b8abbea 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -885,8 +885,10 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop( } // Update Live In information - recomputeLiveIns(*testMBB); - recomputeLiveIns(*tailMBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*tailMBB) || recomputeLiveIns(*testMBB); + } while (anyChange); } void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64( @@ -1378,10 +1380,11 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, footMBB->addSuccessor(&MBB); } - recomputeLiveIns(*headMBB); - recomputeLiveIns(*bodyMBB); - recomputeLiveIns(*footMBB); - recomputeLiveIns(MBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*footMBB) || recomputeLiveIns(*bodyMBB) || + recomputeLiveIns(*headMBB) || recomputeLiveIns(MBB); + } while (anyChange); } } else { MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg) diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 833f058253d880..553d338b77904a 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2923,11 +2923,10 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, } bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { - // Cannot use 32 bit constants to reference objects in kernel code model. - // Cannot use 32 bit constants to reference objects in large PIC mode since - // GOTOFF is 64 bits. + // Cannot use 32 bit constants to reference objects in kernel/large code + // model. 
if (TM.getCodeModel() == CodeModel::Kernel || - (TM.getCodeModel() == CodeModel::Large && TM.isPositionIndependent())) + TM.getCodeModel() == CodeModel::Large) return false; // In static codegen with small code model, we can get the address of a label diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e158312caffdec..71fc6b5047eaa9 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -18703,16 +18703,18 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if (Subtarget.isTargetDarwin()) { // Darwin only has one model of TLS. Lower to that. unsigned char OpFlag = 0; - unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ? - X86ISD::WrapperRIP : X86ISD::Wrapper; + unsigned WrapperKind = 0; // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. bool PIC32 = PositionIndependent && !Subtarget.is64Bit(); - if (PIC32) + if (PIC32) { OpFlag = X86II::MO_TLVP_PIC_BASE; - else + WrapperKind = X86ISD::Wrapper; + } else { OpFlag = X86II::MO_TLVP; + WrapperKind = X86ISD::WrapperRIP; + } SDLoc DL(Op); SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, GA->getValueType(0), @@ -47033,10 +47035,13 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget)) return V; - // fold (ashr (shl, a, [56,48,32,24,16]), SarConst) - // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or - // into (lshr, (sext (a), SarConst - [56,48,32,24,16])) - // depending on sign of (SarConst - [56,48,32,24,16]) + // fold (SRA (SHL X, ShlConst), SraConst) + // into (SHL (sext_in_reg X), ShlConst - SraConst) + // or (sext_in_reg X) + // or (SRA (sext_in_reg X), SraConst - ShlConst) + // depending on relation between SraConst and ShlConst. + // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. 
That allows + // us to do the sext_in_reg from corresponding bit. // sexts in X86 are MOVs. The MOVs have the same code size // as above SHIFTs (only SHIFT on 1 has lower code size). @@ -47052,29 +47057,29 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); APInt ShlConst = N01->getAsAPIntVal(); - APInt SarConst = N1->getAsAPIntVal(); + APInt SraConst = N1->getAsAPIntVal(); EVT CVT = N1.getValueType(); - if (SarConst.isNegative()) + if (CVT != N01.getValueType()) + return SDValue(); + if (SraConst.isNegative()) return SDValue(); for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) { unsigned ShiftSize = SVT.getSizeInBits(); - // skipping types without corresponding sext/zext and - // ShlConst that is not one of [56,48,32,24,16] + // Only deal with (Size - ShlConst) being equal to 8, 16 or 32. if (ShiftSize >= Size || ShlConst != Size - ShiftSize) continue; SDLoc DL(N); SDValue NN = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT)); - SarConst = SarConst - (Size - ShiftSize); - if (SarConst == 0) + if (SraConst.eq(ShlConst)) return NN; - if (SarConst.isNegative()) + if (SraConst.ult(ShlConst)) return DAG.getNode(ISD::SHL, DL, VT, NN, - DAG.getConstant(-SarConst, DL, CVT)); + DAG.getConstant(ShlConst - SraConst, DL, CVT)); return DAG.getNode(ISD::SRA, DL, VT, NN, - DAG.getConstant(SarConst, DL, CVT)); + DAG.getConstant(SraConst - ShlConst, DL, CVT)); } return SDValue(); } @@ -47876,6 +47881,7 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, SDValue X, Y; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (SDValue Not = GetNot(N0)) { X = Not; @@ -47889,9 +47895,11 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, X = DAG.getBitcast(VT, X); Y = DAG.getBitcast(VT, Y); SDLoc DL(N); + // We do not split for SSE at all, but we need to split vectors for AVX1 
and // AVX2. - if (!Subtarget.useAVX512Regs() && VT.is512BitVector()) { + if (!Subtarget.useAVX512Regs() && VT.is512BitVector() && + TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) { SDValue LoX, HiX; std::tie(LoX, HiX) = splitVector(X, DAG, DL); SDValue LoY, HiY; @@ -47901,7 +47909,11 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY}); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV}); } - return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y}); + + if (TLI.isTypeLegal(VT)) + return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y}); + + return SDValue(); } // Try to widen AND, OR and XOR nodes to VT in order to remove casts around diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index fe7d90fbcdf707..bb5e22c7142793 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -12422,7 +12422,7 @@ multiclass GF2P8AFFINE_avx512_rmb_imm Op, string OpStr, SDNode OpNode, : avx512_3Op_rm_imm8 { let ExeDomain = VTI.ExeDomain in defm rmbi : AVX512_maskablegetParent()->getFunction().hasOptSize()) { - unsigned Mask; - switch (Opc) { - default: - llvm_unreachable("Unreachable!"); - case X86::BLENDPDrri: - Opc = X86::MOVSDrr; - Mask = 0x03; - break; - case X86::BLENDPSrri: - Opc = X86::MOVSSrr; - Mask = 0x0F; - break; - case X86::VBLENDPDrri: - Opc = X86::VMOVSDrr; - Mask = 0x03; - break; - case X86::VBLENDPSrri: - Opc = X86::VMOVSSrr; - Mask = 0x0F; - break; - } + unsigned Mask = (Opc == X86::BLENDPDrri || Opc == X86::VBLENDPDrri) ? 
0x03: 0x0F; if ((MI.getOperand(3).getImm() ^ Mask) == 1) { +#define FROM_TO(FROM, TO) \ + case X86::FROM: \ + Opc = X86::TO; \ + break; + switch (Opc) { + default: + llvm_unreachable("Unreachable!"); + FROM_TO(BLENDPDrri, MOVSDrr) + FROM_TO(BLENDPSrri, MOVSSrr) + FROM_TO(VBLENDPDrri, VMOVSDrr) + FROM_TO(VBLENDPSrri, VMOVSSrr) + } WorkingMI = CloneIfNew(MI); WorkingMI->setDesc(get(Opc)); WorkingMI->removeOperand(3); break; } +#undef FROM_TO } [[fallthrough]]; case X86::PBLENDWrri: diff --git a/llvm/lib/Target/X86/X86InstrVecCompiler.td b/llvm/lib/Target/X86/X86InstrVecCompiler.td index bbd19cf8d5b25e..461b2badc13134 100644 --- a/llvm/lib/Target/X86/X86InstrVecCompiler.td +++ b/llvm/lib/Target/X86/X86InstrVecCompiler.td @@ -83,6 +83,7 @@ defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; +defm : subvector_subreg_lowering; // A 128-bit subvector extract from the first 512-bit vector position is a // subregister copy that needs no instruction. Likewise, a 128-bit subvector @@ -95,6 +96,7 @@ defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; +defm : subvector_subreg_lowering; // A 128-bit subvector extract from the first 512-bit vector position is a // subregister copy that needs no instruction. 
Likewise, a 128-bit subvector @@ -107,6 +109,7 @@ defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; +defm : subvector_subreg_lowering; // If we're inserting into an all zeros vector, just use a plain move which diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 58ebe023cd61ec..7ce0aa22b99795 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -959,8 +959,10 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI, SmallString<256> Code; unsigned MinSize = MI.getOperand(0).getImm(); - if (NextMI != MI.getParent()->end()) { + if (NextMI != MI.getParent()->end() && !NextMI->isInlineAsm()) { // Lower the next MachineInstr to find its byte size. + // If the next instruction is inline assembly, we skip lowering it for now, + // and assume we should always generate NOPs. MCInst MCI; MCIL.Lower(&*NextMI, MCI); diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index a458b5f9ec8fbb..4d55a084b730e4 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -244,7 +244,8 @@ class X86Subtarget final : public X86GenSubtargetInfo { // TODO: Currently we're always allowing widening on CPUs without VLX, // because for many cases we don't have a better option. 
bool canExtendTo512DQ() const { - return hasAVX512() && (!hasVLX() || getPreferVectorWidth() >= 512); + return hasAVX512() && hasEVEX512() && + (!hasVLX() || getPreferVectorWidth() >= 512); } bool canExtendTo512BW() const { return hasBWI() && canExtendTo512DQ(); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index cd40b1d3b09332..be774a89eccbb4 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -6080,6 +6080,10 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, for (const Instruction &I : instructions(Callee)) { if (const auto *CB = dyn_cast(&I)) { + // Having more target features is fine for inline ASM. + if (CB->isInlineAsm()) + continue; + SmallVector Types; for (Value *Arg : CB->args()) Types.push_back(Arg->getType()); diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index f1197c29655380..1adef15771fa17 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -321,6 +321,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) { return StringSwitch(Part) .Case("0xac3", "ampere1") .Case("0xac4", "ampere1a") + .Case("0xac5", "ampere1b") .Default("generic"); } @@ -1265,8 +1266,10 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, setFeature(X86::FEATURE_AVX2); if (HasLeaf7 && ((EBX >> 8) & 1)) setFeature(X86::FEATURE_BMI2); - if (HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save) + if (HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save) { setFeature(X86::FEATURE_AVX512F); + setFeature(X86::FEATURE_EVEX512); + } if (HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save) setFeature(X86::FEATURE_AVX512DQ); if (HasLeaf7 && ((EBX >> 19) & 1)) @@ -1771,6 +1774,7 @@ bool sys::getHostCPUFeatures(StringMap &Features) { Features["rtm"] = HasLeaf7 && ((EBX >> 11) & 1); // AVX512 is only supported if the OS supports the context save for it. 
Features["avx512f"] = HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save; + Features["evex512"] = Features["avx512f"]; Features["avx512dq"] = HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save; Features["rdseed"] = HasLeaf7 && ((EBX >> 18) & 1); Features["adx"] = HasLeaf7 && ((EBX >> 19) & 1); diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 3cbe974ff31421..20f324604aa52a 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -294,6 +294,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["gfx12-insts"] = true; Features["atomic-fadd-rtn-insts"] = true; Features["image-insts"] = true; + Features["fp8-conversion-insts"] = true; break; case GK_GFX1151: case GK_GFX1150: diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index 8058282c422503..062a3d341007ce 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -652,10 +652,6 @@ static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR, // check to see if the pointer is guaranteed to not be modified from entry of // the function to each of the load instructions. - // Because there could be several/many load instructions, remember which - // blocks we know to be transparent to the load. - df_iterator_default_set TranspBlocks; - for (LoadInst *Load : Loads) { // Check to see if the load is invalidated from the start of the block to // the load itself. @@ -669,7 +665,7 @@ static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR, // To do this, we perform a depth first search on the inverse CFG from the // loading block. 
for (BasicBlock *P : predecessors(BB)) { - for (BasicBlock *TranspBB : inverse_depth_first_ext(P, TranspBlocks)) + for (BasicBlock *TranspBB : inverse_depth_first(P)) if (AAR.canBasicBlockModify(*TranspBB, Loc)) return false; } diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 733f290b1bc93a..633fcb3314c42f 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -470,6 +470,9 @@ class LowerTypeTestsModule { Function *WeakInitializerFn = nullptr; + GlobalVariable *GlobalAnnotation; + DenseSet FunctionAnnotations; + bool shouldExportConstantsAsAbsoluteSymbols(); uint8_t *exportTypeId(StringRef TypeId, const TypeIdLowering &TIL); TypeIdLowering importTypeId(StringRef TypeId); @@ -531,6 +534,10 @@ class LowerTypeTestsModule { /// replace each use, which is a direct function call. void replaceDirectCalls(Value *Old, Value *New); + bool isFunctionAnnotation(Value *V) const { + return FunctionAnnotations.contains(V); + } + public: LowerTypeTestsModule(Module &M, ModuleAnalysisManager &AM, ModuleSummaryIndex *ExportSummary, @@ -1377,8 +1384,11 @@ void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr( // (all?) targets. Switch to a runtime initializer. SmallSetVector GlobalVarUsers; findGlobalVariableUsersOf(F, GlobalVarUsers); - for (auto *GV : GlobalVarUsers) + for (auto *GV : GlobalVarUsers) { + if (GV == GlobalAnnotation) + continue; moveInitializerToModuleConstructor(GV); + } // Can not RAUW F with an expression that uses F. Replace with a temporary // placeholder first. @@ -1837,6 +1847,16 @@ LowerTypeTestsModule::LowerTypeTestsModule( } OS = TargetTriple.getOS(); ObjectFormat = TargetTriple.getObjectFormat(); + + // Function annotation describes or applies to function itself, and + // shouldn't be associated with jump table thunk generated for CFI. 
+ GlobalAnnotation = M.getGlobalVariable("llvm.global.annotations"); + if (GlobalAnnotation && GlobalAnnotation->hasInitializer()) { + const ConstantArray *CA = + cast(GlobalAnnotation->getInitializer()); + for (Value *Op : CA->operands()) + FunctionAnnotations.insert(Op); + } } bool LowerTypeTestsModule::runForTesting(Module &M, ModuleAnalysisManager &AM) { @@ -1896,10 +1916,14 @@ void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, if (isa(U.getUser())) continue; - // Skip direct calls to externally defined or non-dso_local functions + // Skip direct calls to externally defined or non-dso_local functions. if (isDirectCall(U) && (Old->isDSOLocal() || !IsJumpTableCanonical)) continue; + // Skip function annotation. + if (isFunctionAnnotation(U.getUser())) + continue; + // Must handle Constants specially, we cannot call replaceUsesOfWith on a // constant because they are uniqued. if (auto *C = dyn_cast(U.getUser())) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index a647be2d26c761..bc43edb5e62065 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -412,11 +412,14 @@ Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) { if (auto *SplatPtr = getSplatValue(II.getArgOperand(1))) { // scatter(splat(value), splat(ptr), non-zero-mask) -> store value, ptr if (auto *SplatValue = getSplatValue(II.getArgOperand(0))) { - Align Alignment = cast(II.getArgOperand(2))->getAlignValue(); - StoreInst *S = - new StoreInst(SplatValue, SplatPtr, /*IsVolatile=*/false, Alignment); - S->copyMetadata(II); - return S; + if (maskContainsAllOneOrUndef(ConstMask)) { + Align Alignment = + cast(II.getArgOperand(2))->getAlignValue(); + StoreInst *S = new StoreInst(SplatValue, SplatPtr, /*IsVolatile=*/false, + Alignment); + S->copyMetadata(II); + return S; + } } // scatter(vector, splat(ptr), splat(true)) -> 
store extract(vector, // lastlane), ptr diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 58f0763bb0c0cd..c5d3f60176a826 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -2156,14 +2156,14 @@ static bool collectInsertionElements(Value *V, unsigned Shift, Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize); for (unsigned i = 0; i != NumElts; ++i) { - unsigned ShiftI = Shift + i * ElementSize; + unsigned ShiftI = i * ElementSize; Constant *Piece = ConstantFoldBinaryInstruction( Instruction::LShr, C, ConstantInt::get(C->getType(), ShiftI)); if (!Piece) return false; Piece = ConstantExpr::getTrunc(Piece, ElementIntTy); - if (!collectInsertionElements(Piece, ShiftI, Elements, VecEltTy, + if (!collectInsertionElements(Piece, ShiftI + Shift, Elements, VecEltTy, isBigEndian)) return false; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 8c0fd662255130..9973a80a7db946 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -6491,6 +6491,13 @@ InstCombiner::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred, if (!SafeReplacementConstant) SafeReplacementConstant = CI; } + } else if (isa(C->getType())) { + // Handle scalable splat + Value *SplatC = C->getSplatValue(); + auto *CI = dyn_cast_or_null(SplatC); + // Bail out if the constant can't be safely incremented/decremented. + if (!CI || !ConstantIsOk(CI)) + return std::nullopt; } else { // ConstantExpr? 
return std::nullopt; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index bb2a77daa60a76..1254a050027a45 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -1032,7 +1032,8 @@ Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) { // where there are several consecutive memory accesses to the same location, // separated by a few arithmetic operations. bool IsLoadCSE = false; - if (Value *AvailableVal = FindAvailableLoadedValue(&LI, *AA, &IsLoadCSE)) { + BatchAAResults BatchAA(*AA); + if (Value *AvailableVal = FindAvailableLoadedValue(&LI, BatchAA, &IsLoadCSE)) { if (IsLoadCSE) combineMetadataForCSE(cast(AvailableVal), &LI, false); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 21bfc91148bfeb..8cc7901cbac7fa 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1284,7 +1284,11 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel, isGuaranteedNotToBeUndefOrPoison(CmpRHS, SQ.AC, &Sel, &DT)) { if (Value *V = simplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, SQ, /* AllowRefinement */ true)) - return replaceOperand(Sel, Swapped ? 2 : 1, V); + // Require either the replacement or the simplification result to be a + // constant to avoid infinite loops. + // FIXME: Make this check more precise. + if (isa(CmpRHS) || isa(V)) + return replaceOperand(Sel, Swapped ? 
2 : 1, V); // Even if TrueVal does not simplify, we can directly replace a use of // CmpLHS with CmpRHS, as long as the instruction is not used anywhere @@ -1302,7 +1306,8 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel, isGuaranteedNotToBeUndefOrPoison(CmpLHS, SQ.AC, &Sel, &DT)) if (Value *V = simplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, SQ, /* AllowRefinement */ true)) - return replaceOperand(Sel, Swapped ? 2 : 1, V); + if (isa(CmpLHS) || isa(V)) + return replaceOperand(Sel, Swapped ? 2 : 1, V); auto *FalseInst = dyn_cast(FalseVal); if (!FalseInst) @@ -2601,7 +2606,7 @@ static Instruction *foldSelectWithSRem(SelectInst &SI, InstCombinerImpl &IC, // %cnd = icmp slt i32 %rem, 0 // %add = add i32 %rem, %n // %sel = select i1 %cnd, i32 %add, i32 %rem - if (match(TrueVal, m_Add(m_Value(RemRes), m_Value(Remainder))) && + if (match(TrueVal, m_Add(m_Specific(RemRes), m_Value(Remainder))) && match(RemRes, m_SRem(m_Value(Op), m_Specific(Remainder))) && IC.isKnownToBeAPowerOfTwo(Remainder, /*OrZero*/ true) && FalseVal == RemRes) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index a8a5f9831e15e3..79873a9b4cbb4c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -802,6 +802,9 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, return InsertNewInstWith(LShr, I->getIterator()); } else if (Known.One[BitWidth-ShiftAmt-1]) { // New bits are known one. Known.One |= HighBits; + // SignBits may be out-of-sync with Known.countMinSignBits(). Mask out + // high bits of Known.Zero to avoid conflicts. 
+ Known.Zero &= ~HighBits; } } else { computeKnownBits(I, Known, Depth, CxtI); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 249f4a7710e046..6f0cf9d9c8f187 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1455,6 +1455,7 @@ static Value *foldOperationIntoSelectOperand(Instruction &I, SelectInst *SI, Value *NewOp, InstCombiner &IC) { Instruction *Clone = I.clone(); Clone->replaceUsesOfWith(SI, NewOp); + Clone->dropUBImplyingAttrsAndMetadata(); IC.InsertNewInstBefore(Clone, SI->getIterator()); return Clone; } @@ -2594,10 +2595,10 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { Value *V; if ((has_single_bit(TyAllocSize) && match(GEP.getOperand(1), - m_Exact(m_AShr(m_Value(V), - m_SpecificInt(countr_zero(TyAllocSize)))))) || + m_Exact(m_Shr(m_Value(V), + m_SpecificInt(countr_zero(TyAllocSize)))))) || match(GEP.getOperand(1), - m_Exact(m_SDiv(m_Value(V), m_SpecificInt(TyAllocSize))))) { + m_Exact(m_IDiv(m_Value(V), m_SpecificInt(TyAllocSize))))) { GetElementPtrInst *NewGEP = GetElementPtrInst::Create( Builder.getInt8Ty(), GEP.getPointerOperand(), V); NewGEP->setIsInBounds(GEP.isInBounds()); diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 8ee0bca7e354f0..0f42ff79086994 100644 --- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -752,11 +752,12 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) { const unsigned ByteSize = 1U << Idx; const unsigned BitSize = ByteSize * 8; Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); - Value *Args[] = {Addr, - IRB.CreateIntCast(RMWI->getValOperand(), Ty, false), + Value *Val = RMWI->getValOperand(); + Value *Args[] = {Addr, 
IRB.CreateBitOrPointerCast(Val, Ty), createOrdering(&IRB, RMWI->getOrdering())}; - CallInst *C = CallInst::Create(F, Args); - ReplaceInstWithInst(I, C); + Value *C = IRB.CreateCall(F, Args); + I->replaceAllUsesWith(IRB.CreateBitOrPointerCast(C, Val->getType())); + I->eraseFromParent(); } else if (AtomicCmpXchgInst *CASI = dyn_cast(I)) { Value *Addr = CASI->getPointerOperand(); Type *OrigOldValTy = CASI->getNewValOperand()->getType(); diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 8f09569d0d9cc9..7b672e89b67aae 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -1061,11 +1061,16 @@ void State::addInfoFor(BasicBlock &BB) { FactOrCheck::getCheck(DT.getNode(&BB), cast(&I))); break; // Enqueue the intrinsics to add extra info. - case Intrinsic::abs: case Intrinsic::umin: case Intrinsic::umax: case Intrinsic::smin: case Intrinsic::smax: + // TODO: Check if it is possible to instead only added the min/max facts + // when simplifying uses of the min/max intrinsics. + if (!isGuaranteedNotToBePoison(&I)) + break; + [[fallthrough]]; + case Intrinsic::abs: WorkList.push_back(FactOrCheck::getInstFact(DT.getNode(&BB), &I)); break; } diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 11a91bfbe5baff..380d6583655367 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -857,6 +857,9 @@ struct DSEState { // no longer be captured. bool ShouldIterateEndOfFunctionDSE; + /// Dead instructions to be removed at the end of DSE. + SmallVector ToRemove; + // Class contains self-reference, make sure it's not copied/moved. 
DSEState(const DSEState &) = delete; DSEState &operator=(const DSEState &) = delete; @@ -1692,7 +1695,8 @@ struct DSEState { return {MaybeDeadAccess}; } - // Delete dead memory defs + /// Delete dead memory defs and recursively add their operands to ToRemove if + /// they became dead. void deleteDeadInstruction(Instruction *SI) { MemorySSAUpdater Updater(&MSSA); SmallVector NowDeadInsts; @@ -1708,8 +1712,11 @@ struct DSEState { salvageKnowledge(DeadInst); // Remove the Instruction from MSSA. - if (MemoryAccess *MA = MSSA.getMemoryAccess(DeadInst)) { - if (MemoryDef *MD = dyn_cast(MA)) { + MemoryAccess *MA = MSSA.getMemoryAccess(DeadInst); + bool IsMemDef = MA && isa(MA); + if (MA) { + if (IsMemDef) { + auto *MD = cast(MA); SkipStores.insert(MD); if (auto *SI = dyn_cast(MD->getMemoryInst())) { if (SI->getValueOperand()->getType()->isPointerTy()) { @@ -1730,13 +1737,21 @@ struct DSEState { // Remove its operands for (Use &O : DeadInst->operands()) if (Instruction *OpI = dyn_cast(O)) { - O = nullptr; + O.set(PoisonValue::get(O->getType())); if (isInstructionTriviallyDead(OpI, &TLI)) NowDeadInsts.push_back(OpI); } EI.removeInstruction(DeadInst); - DeadInst->eraseFromParent(); + // Remove memory defs directly if they don't produce results, but only + // queue other dead instructions for later removal. They may have been + // used as memory locations that have been cached by BatchAA. Removing + // them here may lead to newly created instructions to be allocated at the + // same address, yielding stale cache entries. 
+ if (IsMemDef && DeadInst->getType()->isVoidTy()) + DeadInst->eraseFromParent(); + else + ToRemove.push_back(DeadInst); } } @@ -1892,15 +1907,15 @@ struct DSEState { Malloc->getArgOperand(0), IRB, TLI); if (!Calloc) return false; + MemorySSAUpdater Updater(&MSSA); auto *NewAccess = Updater.createMemoryAccessAfter(cast(Calloc), nullptr, MallocDef); auto *NewAccessMD = cast(NewAccess); Updater.insertDef(NewAccessMD, /*RenameUses=*/true); - Updater.removeMemoryAccess(Malloc); Malloc->replaceAllUsesWith(Calloc); - Malloc->eraseFromParent(); + deleteDeadInstruction(Malloc); return true; } @@ -2233,6 +2248,12 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, MadeChange |= State.eliminateRedundantStoresOfExistingValues(); MadeChange |= State.eliminateDeadWritesAtEndOfFunction(); + + while (!State.ToRemove.empty()) { + Instruction *DeadInst = State.ToRemove.pop_back_val(); + DeadInst->eraseFromParent(); + } + return MadeChange; } } // end anonymous namespace diff --git a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index 9df28747570c4d..104e8ceb796700 100644 --- a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -279,6 +279,9 @@ bool InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI, Value *LHS = ICI->getOperand(0); Value *RHS = ICI->getOperand(1); + if (!LHS->getType()->isIntegerTy()) + return false; + // Canonicalize to the `Index Pred Invariant` comparison if (IsLoopInvariant(LHS)) { std::swap(LHS, RHS); diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 8603c5cf9c022c..87c01ead634ff8 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1260,8 +1260,11 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { // the entry 
to its block. BasicBlock::iterator BBIt(LoadI); bool IsLoadCSE; + BatchAAResults BatchAA(*AA); + // The dominator tree is updated lazily and may not be valid at this point. + BatchAA.disableDominatorTree(); if (Value *AvailableVal = FindAvailableLoadedValue( - LoadI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) { + LoadI, LoadBB, BBIt, DefMaxInstsToScan, &BatchAA, &IsLoadCSE)) { // If the value of the load is locally available within the block, just use // it. This frequently occurs for reg2mem'd allocas. @@ -1322,9 +1325,9 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { MemoryLocation Loc(LoadedPtr->DoPHITranslation(LoadBB, PredBB), LocationSize::precise(DL.getTypeStoreSize(AccessTy)), AATags); - PredAvailable = findAvailablePtrLoadStore(Loc, AccessTy, LoadI->isAtomic(), - PredBB, BBIt, DefMaxInstsToScan, - AA, &IsLoadCSE, &NumScanedInst); + PredAvailable = findAvailablePtrLoadStore( + Loc, AccessTy, LoadI->isAtomic(), PredBB, BBIt, DefMaxInstsToScan, + &BatchAA, &IsLoadCSE, &NumScanedInst); // If PredBB has a single predecessor, continue scanning through the // single predecessor. 
@@ -1336,7 +1339,7 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { BBIt = SinglePredBB->end(); PredAvailable = findAvailablePtrLoadStore( Loc, AccessTy, LoadI->isAtomic(), SinglePredBB, BBIt, - (DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE, + (DefMaxInstsToScan - NumScanedInst), &BatchAA, &IsLoadCSE, &NumScanedInst); } } diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index bdbaf4f55c96d0..17a94f9381bf8e 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -2257,6 +2257,41 @@ checkVectorTypesForPromotion(Partition &P, const DataLayout &DL, return nullptr; } +static VectorType *createAndCheckVectorTypesForPromotion( + SetVector &OtherTys, ArrayRef CandidateTysCopy, + function_ref CheckCandidateType, Partition &P, + const DataLayout &DL, SmallVectorImpl &CandidateTys, + bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy, + bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy) { + [[maybe_unused]] VectorType *OriginalElt = + CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr; + // Consider additional vector types where the element type size is a + // multiple of load/store element size. + for (Type *Ty : OtherTys) { + if (!VectorType::isValidElementType(Ty)) + continue; + unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue(); + // Make a copy of CandidateTys and iterate through it, because we + // might append to CandidateTys in the loop. 
+ for (VectorType *const VTy : CandidateTysCopy) { + // The elements in the copy should remain invariant throughout the loop + assert(CandidateTysCopy[0] == OriginalElt && "Different Element"); + unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue(); + unsigned ElementSize = + DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue(); + if (TypeSize != VectorSize && TypeSize != ElementSize && + VectorSize % TypeSize == 0) { + VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false); + CheckCandidateType(NewVTy); + } + } + } + + return checkVectorTypesForPromotion(P, DL, CandidateTys, HaveCommonEltTy, + CommonEltTy, HaveVecPtrTy, + HaveCommonVecPtrTy, CommonVecPtrTy); +} + /// Test whether the given alloca partitioning and range of slices can be /// promoted to a vector. /// @@ -2271,6 +2306,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { // we have different element types. SmallVector CandidateTys; SetVector LoadStoreTys; + SetVector DeferredTys; Type *CommonEltTy = nullptr; VectorType *CommonVecPtrTy = nullptr; bool HaveVecPtrTy = false; @@ -2314,42 +2350,32 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { Ty = SI->getValueOperand()->getType(); else continue; + + auto CandTy = Ty->getScalarType(); + if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() || + S.endOffset() != P.endOffset())) { + DeferredTys.insert(Ty); + continue; + } + LoadStoreTys.insert(Ty); // Consider any loads or stores that are the exact size of the slice. 
if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset()) CheckCandidateType(Ty); } - if (auto *VTy = checkVectorTypesForPromotion( - P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, + SmallVector CandidateTysCopy = CandidateTys; + if (auto *VTy = createAndCheckVectorTypesForPromotion( + LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL, + CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy, CommonVecPtrTy)) return VTy; - // Consider additional vector types where the element type size is a - // multiple of load/store element size. - for (Type *Ty : LoadStoreTys) { - if (!VectorType::isValidElementType(Ty)) - continue; - unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue(); - // Make a copy of CandidateTys and iterate through it, because we might - // append to CandidateTys in the loop. - SmallVector CandidateTysCopy = CandidateTys; - CandidateTys.clear(); - for (VectorType *&VTy : CandidateTysCopy) { - unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue(); - unsigned ElementSize = - DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue(); - if (TypeSize != VectorSize && TypeSize != ElementSize && - VectorSize % TypeSize == 0) { - VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false); - CheckCandidateType(NewVTy); - } - } - } - - return checkVectorTypesForPromotion(P, DL, CandidateTys, HaveCommonEltTy, - CommonEltTy, HaveVecPtrTy, - HaveCommonVecPtrTy, CommonVecPtrTy); + CandidateTys.clear(); + return createAndCheckVectorTypesForPromotion( + DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys, + HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy, + CommonVecPtrTy); } /// Test whether a slice of an alloca is valid for integer widening. 
diff --git a/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/llvm/lib/Transforms/Utils/FlattenCFG.cpp index 1925b91c4da7ec..c5cb3748a52f8d 100644 --- a/llvm/lib/Transforms/Utils/FlattenCFG.cpp +++ b/llvm/lib/Transforms/Utils/FlattenCFG.cpp @@ -407,6 +407,10 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Block1, BasicBlock *Block2, /// form, by inverting the condition and the branch successors. The same /// approach goes for the opposite case. bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) { + // We cannot merge the if-region if the merge point has phi nodes. + if (isa(BB->front())) + return false; + BasicBlock *IfTrue2, *IfFalse2; BranchInst *DomBI2 = GetIfCondition(BB, IfTrue2, IfFalse2); if (!DomBI2) @@ -493,16 +497,6 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) { PBI->replaceUsesOfWith(PBI->getCondition(), NC); Builder.SetInsertPoint(SaveInsertBB, SaveInsertPt); - // Handle PHI node to replace its predecessors to FirstEntryBlock. - for (BasicBlock *Succ : successors(PBI)) { - for (PHINode &Phi : Succ->phis()) { - for (unsigned i = 0, e = Phi.getNumIncomingValues(); i != e; ++i) { - if (Phi.getIncomingBlock(i) == SecondEntryBlock) - Phi.setIncomingBlock(i, FirstEntryBlock); - } - } - } - // Remove IfTrue1 if (IfTrue1 != FirstEntryBlock) { IfTrue1->dropAllReferences(); diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 459e3d98059283..a1c6bbc52fd05e 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3369,11 +3369,17 @@ void llvm::patchReplacementInstruction(Instruction *I, Value *Repl) { // Patch the replacement so that it is not more restrictive than the value // being replaced. + WithOverflowInst *UnusedWO; + // When replacing the result of a llvm.*.with.overflow intrinsic with a + // overflowing binary operator, nuw/nsw flags may no longer hold. 
+ if (isa(ReplInst) && + match(I, m_ExtractValue<0>(m_WithOverflowInst(UnusedWO)))) + ReplInst->dropPoisonGeneratingFlags(); // Note that if 'I' is a load being replaced by some operation, // for example, by an arithmetic operation, then andIRFlags() // would just erase all math flags from the original arithmetic // operation, which is clearly not wanted and not needed. - if (!isa(I)) + else if (!isa(I)) ReplInst->andIRFlags(I); // FIXME: If both the original and replacement value are part of the diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index a1d7f0f9ba0f74..a3951fdf8a1589 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -1366,61 +1366,6 @@ Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty) { return V; } -static bool -canReuseInstruction(ScalarEvolution &SE, const SCEV *S, Instruction *I, - SmallVectorImpl &DropPoisonGeneratingInsts) { - // If the instruction cannot be poison, it's always safe to reuse. - if (programUndefinedIfPoison(I)) - return true; - - // Otherwise, it is possible that I is more poisonous that S. Collect the - // poison-contributors of S, and then check whether I has any additional - // poison-contributors. Poison that is contributed through poison-generating - // flags is handled by dropping those flags instead. - SmallPtrSet PoisonVals; - SE.getPoisonGeneratingValues(PoisonVals, S); - - SmallVector Worklist; - SmallPtrSet Visited; - Worklist.push_back(I); - while (!Worklist.empty()) { - Value *V = Worklist.pop_back_val(); - if (!Visited.insert(V).second) - continue; - - // Avoid walking large instruction graphs. - if (Visited.size() > 16) - return false; - - // Either the value can't be poison, or the S would also be poison if it - // is. 
- if (PoisonVals.contains(V) || isGuaranteedNotToBePoison(V)) - continue; - - auto *I = dyn_cast(V); - if (!I) - return false; - - // FIXME: Ignore vscale, even though it technically could be poison. Do this - // because SCEV currently assumes it can't be poison. Remove this special - // case once we proper model when vscale can be poison. - if (auto *II = dyn_cast(I); - II && II->getIntrinsicID() == Intrinsic::vscale) - continue; - - if (canCreatePoison(cast(I), /*ConsiderFlagsAndMetadata*/ false)) - return false; - - // If the instruction can't create poison, we can recurse to its operands. - if (I->hasPoisonGeneratingFlagsOrMetadata()) - DropPoisonGeneratingInsts.push_back(I); - - for (Value *Op : I->operands()) - Worklist.push_back(Op); - } - return true; -} - Value *SCEVExpander::FindValueInExprValueMap( const SCEV *S, const Instruction *InsertPt, SmallVectorImpl &DropPoisonGeneratingInsts) { @@ -1448,7 +1393,7 @@ Value *SCEVExpander::FindValueInExprValueMap( continue; // Make sure reusing the instruction is poison-safe. 
- if (canReuseInstruction(SE, S, EntInst, DropPoisonGeneratingInsts)) + if (SE.canReuseInstruction(S, EntInst, DropPoisonGeneratingInsts)) return V; DropPoisonGeneratingInsts.clear(); } diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index 0ed3324a27b6c9..1b142f14d81139 100644 --- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -713,8 +714,11 @@ bool SimplifyIndvar::replaceFloatIVWithIntegerIV(Instruction *UseInst) { bool SimplifyIndvar::eliminateIdentitySCEV(Instruction *UseInst, Instruction *IVOperand) { if (!SE->isSCEVable(UseInst->getType()) || - (UseInst->getType() != IVOperand->getType()) || - (SE->getSCEV(UseInst) != SE->getSCEV(IVOperand))) + UseInst->getType() != IVOperand->getType()) + return false; + + const SCEV *UseSCEV = SE->getSCEV(UseInst); + if (UseSCEV != SE->getSCEV(IVOperand)) return false; // getSCEV(X) == getSCEV(Y) does not guarantee that X and Y are related in the @@ -742,6 +746,16 @@ bool SimplifyIndvar::eliminateIdentitySCEV(Instruction *UseInst, if (!LI->replacementPreservesLCSSAForm(UseInst, IVOperand)) return false; + // Make sure the operand is not more poisonous than the instruction. 
+ if (!impliesPoison(IVOperand, UseInst)) { + SmallVector DropPoisonGeneratingInsts; + if (!SE->canReuseInstruction(UseSCEV, IVOperand, DropPoisonGeneratingInsts)) + return false; + + for (Instruction *I : DropPoisonGeneratingInsts) + I->dropPoisonGeneratingFlagsAndMetadata(); + } + LLVM_DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n'); SE->forgetValue(UseInst); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 6ca93e15719fb2..dd596c567cd482 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1957,6 +1957,8 @@ class GeneratedRTChecks { bool CostTooHigh = false; const bool AddBranchWeights; + Loop *OuterLoop = nullptr; + public: GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, TargetTransformInfo *TTI, const DataLayout &DL, @@ -2053,6 +2055,9 @@ class GeneratedRTChecks { DT->eraseNode(SCEVCheckBlock); LI->removeBlock(SCEVCheckBlock); } + + // Outer loop is used as part of the later cost calculations. + OuterLoop = L->getParentLoop(); } InstructionCost getCost() { @@ -2076,16 +2081,61 @@ class GeneratedRTChecks { LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); RTCheckCost += C; } - if (MemCheckBlock) + if (MemCheckBlock) { + InstructionCost MemCheckCost = 0; for (Instruction &I : *MemCheckBlock) { if (MemCheckBlock->getTerminator() == &I) continue; InstructionCost C = TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); - RTCheckCost += C; + MemCheckCost += C; } + // If the runtime memory checks are being created inside an outer loop + // we should find out if these checks are outer loop invariant. If so, + // the checks will likely be hoisted out and so the effective cost will + // reduce according to the outer loop trip count. 
+ if (OuterLoop) { + ScalarEvolution *SE = MemCheckExp.getSE(); + // TODO: If profitable, we could refine this further by analysing every + // individual memory check, since there could be a mixture of loop + // variant and invariant checks that mean the final condition is + // variant. + const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond); + if (SE->isLoopInvariant(Cond, OuterLoop)) { + // It seems reasonable to assume that we can reduce the effective + // cost of the checks even when we know nothing about the trip + // count. Assume that the outer loop executes at least twice. + unsigned BestTripCount = 2; + + // If exact trip count is known use that. + if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop)) + BestTripCount = SmallTC; + else if (LoopVectorizeWithBlockFrequency) { + // Else use profile data if available. + if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop)) + BestTripCount = *EstimatedTC; + } + + InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount; + + // Let's ensure the cost is always at least 1. + NewMemCheckCost = std::max(*NewMemCheckCost.getValue(), + (InstructionCost::CostType)1); + + LLVM_DEBUG(dbgs() + << "We expect runtime memory checks to be hoisted " + << "out of the outer loop. Cost reduced from " + << MemCheckCost << " to " << NewMemCheckCost << '\n'); + + MemCheckCost = NewMemCheckCost; + } + } + + RTCheckCost += MemCheckCost; + } + if (SCEVCheckBlock || MemCheckBlock) LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost << "\n"); @@ -2144,8 +2194,8 @@ class GeneratedRTChecks { BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); // Create new preheader for vector loop. 
- if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) - PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); + if (OuterLoop) + OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI); SCEVCheckBlock->getTerminator()->eraseFromParent(); SCEVCheckBlock->moveBefore(LoopVectorPreHeader); @@ -2179,8 +2229,8 @@ class GeneratedRTChecks { DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); MemCheckBlock->moveBefore(LoopVectorPreHeader); - if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) - PL->addBasicBlockToLoop(MemCheckBlock, *LI); + if (OuterLoop) + OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI); BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 601d2454c1e163..1fbd69e38eaeec 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -10215,8 +10215,18 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { UniqueBases.insert(VecBase); // If the only one use is vectorized - can delete the extractelement // itself. 
- if (!EI->hasOneUse() || any_of(EI->users(), [&](User *U) { - return !R.ScalarToTreeEntry.count(U); + if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) || + any_of(EI->users(), [&](User *U) { + const TreeEntry *UTE = R.getTreeEntry(U); + return !UTE || R.MultiNodeScalars.contains(U) || + count_if(R.VectorizableTree, + [&](const std::unique_ptr &TE) { + return any_of(TE->UserTreeIndices, + [&](const EdgeInfo &Edge) { + return Edge.UserTE == UTE; + }) && + is_contained(TE->Scalars, EI); + }) != 1; })) continue; R.eraseInstruction(EI); @@ -11643,12 +11653,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1)) TysForDecl.push_back( FixedVectorType::get(CI->getType(), E->Scalars.size())); + auto *CEI = cast(VL0); for (unsigned I : seq(0, CI->arg_size())) { ValueList OpVL; // Some intrinsics have scalar arguments. This argument should not be // vectorized. if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) { - CallInst *CEI = cast(VL0); ScalarArg = CEI->getArgOperand(I); OpVecs.push_back(CEI->getArgOperand(I)); if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I)) @@ -11661,6 +11671,25 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } + auto GetOperandSignedness = [&](unsigned Idx) { + const TreeEntry *OpE = getOperandEntry(E, Idx); + bool IsSigned = false; + auto It = MinBWs.find(OpE); + if (It != MinBWs.end()) + IsSigned = It->second.second; + else + IsSigned = any_of(OpE->Scalars, [&](Value *R) { + return !isKnownNonNegative(R, SimplifyQuery(*DL)); + }); + return IsSigned; + }; + ScalarArg = CEI->getArgOperand(I); + if (cast(OpVec->getType())->getElementType() != + ScalarArg->getType()) { + auto *CastTy = FixedVectorType::get(ScalarArg->getType(), + VecTy->getNumElements()); + OpVec = Builder.CreateIntCast(OpVec, CastTy, 
GetOperandSignedness(I)); + } LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I)) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index bbeb5da2cfec3e..ae2fc522ba4002 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -597,13 +597,15 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { for (const auto &I : enumerate(operands())) { // Some intrinsics have a scalar argument - don't replace it with a // vector. - // Some vectorized function variants may also take a scalar argument, - // e.g. linear parameters for pointers. Value *Arg; - if ((VFTy && !VFTy->getParamType(I.index())->isVectorTy()) || - (UseIntrinsic && - isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))) + if (UseIntrinsic && + isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())) Arg = State.get(I.value(), VPIteration(0, 0)); + // Some vectorized function variants may also take a scalar argument, + // e.g. linear parameters for pointers. This needs to be the scalar value + // from the start of the respective part when interleaving. 
+ else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy()) + Arg = State.get(I.value(), VPIteration(Part, 0)); else Arg = State.get(I.value(), Part); if (UseIntrinsic && diff --git a/llvm/test/Analysis/CostModel/RISCV/vector-cost-without-v.ll b/llvm/test/Analysis/CostModel/RISCV/vector-cost-without-v.ll new file mode 100644 index 00000000000000..cd99065f0285cd --- /dev/null +++ b/llvm/test/Analysis/CostModel/RISCV/vector-cost-without-v.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -mtriple=riscv64 -mattr=+f,+d --passes=loop-unroll-full -S | FileCheck %s + +; Check it doesn't crash when the vector extension is not enabled. +define void @foo() { +; CHECK-LABEL: define void @foo( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr null, align 4 +; CHECK-NEXT: [[SPLAT_SPLAT_I_I_I:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x i32> zeroinitializer +; CHECK-NEXT: [[CMP1_I_I_I:%.*]] = fcmp ogt <2 x float> zeroinitializer, zeroinitializer +; CHECK-NEXT: [[SPLAT_SPLAT3_I_I_I:%.*]] = shufflevector <2 x i32> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer +; CHECK-NEXT: [[XOR3_I_I_I_I_I:%.*]] = select <2 x i1> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr null, align 4 +; CHECK-NEXT: [[SPLAT_SPLAT8_I_I_I:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x i32> zeroinitializer +; CHECK-NEXT: [[SUB_I_I_I:%.*]] = fsub <2 x float> zeroinitializer, zeroinitializer +; CHECK-NEXT: [[MUL_I_I_I:%.*]] = shl i64 0, 0 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr null, align 4 +; CHECK-NEXT: 
[[SPLAT_SPLAT_I_I_I_I:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x i32> zeroinitializer +; CHECK-NEXT: [[XOR3_I_I_I_V_I_I:%.*]] = select <2 x i1> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV1]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV1]], 8 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv1 = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %0 = load float, ptr null, align 4 + %splat.splat.i.i.i = shufflevector <2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x i32> zeroinitializer + %cmp1.i.i.i = fcmp ogt <2 x float> zeroinitializer, zeroinitializer + %splat.splat3.i.i.i = shufflevector <2 x i32> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer + %xor3.i.i.i.i.i = select <2 x i1> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer + %1 = load float, ptr null, align 4 + %splat.splat8.i.i.i = shufflevector <2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x i32> zeroinitializer + %sub.i.i.i = fsub <2 x float> zeroinitializer, zeroinitializer + %mul.i.i.i = shl i64 0, 0 + %2 = load float, ptr null, align 4 + %splat.splat.i.i.i.i = shufflevector <2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x i32> zeroinitializer + %xor3.i.i.i.v.i.i = select <2 x i1> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer + %indvars.iv.next = add i64 %indvars.iv1, 1 + %exitcond = icmp ne i64 %indvars.iv1, 8 + br i1 %exitcond, label %for.body, label %exit + +exit: ; preds = %for.body + ret void +} diff --git a/llvm/test/Analysis/LoopAccessAnalysis/noalias-scope-decl.ll b/llvm/test/Analysis/LoopAccessAnalysis/noalias-scope-decl.ll index 98bb5f99a40a1e..fb296f5089422d 100644 --- 
a/llvm/test/Analysis/LoopAccessAnalysis/noalias-scope-decl.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/noalias-scope-decl.ll @@ -7,8 +7,17 @@ define void @test_scope_in_loop(ptr %arg, i64 %num) { ; CHECK-LABEL: 'test_scope_in_loop' ; CHECK-NEXT: loop: -; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Backward loop carried data dependence. ; CHECK-NEXT: Dependences: +; CHECK-NEXT: Backward: +; CHECK-NEXT: %load.prev = load i8, ptr %prev.ptr, align 1, !alias.scope !0, !noalias !3 -> +; CHECK-NEXT: store i8 %add, ptr %cur.ptr, align 1, !alias.scope !3 +; CHECK-EMPTY: +; CHECK-NEXT: Forward: +; CHECK-NEXT: %load.cur = load i8, ptr %cur.ptr, align 1, !alias.scope !3 -> +; CHECK-NEXT: store i8 %add, ptr %cur.ptr, align 1, !alias.scope !3 +; CHECK-EMPTY: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: ; CHECK-EMPTY: diff --git a/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll b/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll new file mode 100644 index 00000000000000..106dc8c13a49fa --- /dev/null +++ b/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll @@ -0,0 +1,180 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes='print' -disable-output %s 2>&1 | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + +; Test case for https://github.com/llvm/llvm-project/issues/82665. +define void @indirect_ptr_recurrences_read_write(ptr %A, ptr %B) { +; CHECK-LABEL: 'indirect_ptr_recurrences_read_write' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. 
Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unsafe indirect dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: IndidrectUnsafe: +; CHECK-NEXT: %l = load i32, ptr %ptr.recur, align 4, !tbaa !4 -> +; CHECK-NEXT: store i32 %xor, ptr %ptr.recur, align 4, !tbaa !4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop ] + %ptr.recur = phi ptr [ %A, %entry ], [ %ptr.next, %loop ] + %gep.B = getelementptr inbounds ptr, ptr %B, i64 %iv + %ptr.next = load ptr, ptr %gep.B, align 8, !tbaa !6 + %l = load i32, ptr %ptr.recur, align 4, !tbaa !10 + %xor = xor i32 %l, 1 + store i32 %xor, ptr %ptr.recur, align 4, !tbaa !10 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 5 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define i32 @indirect_ptr_recurrences_read_only_loop(ptr %A, ptr %B) { +; CHECK-LABEL: 'indirect_ptr_recurrences_read_only_loop' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop ] + %ptr.recur = phi ptr [ %A, %entry ], [ %ptr.next, %loop ] + %red = phi i32 [ 0, %entry ], [ %xor, %loop ] + %gep.B = getelementptr inbounds ptr, ptr %B, i64 %iv + %ptr.next = load ptr, ptr %gep.B, align 8, !tbaa !6 + %l = load i32, ptr %ptr.recur, align 4, !tbaa !10 + %xor = xor i32 %l, 1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 5 + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %xor +} + +define void @indirect_ptr_recurrences_read_write_may_alias_no_tbaa(ptr %A, ptr %B) { +; CHECK-LABEL: 'indirect_ptr_recurrences_read_write_may_alias_no_tbaa' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: cannot identify array bounds +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop ] + %ptr.recur = phi ptr [ %A, %entry ], [ %ptr.next, %loop ] + %gep.B = getelementptr inbounds ptr, ptr %B, i64 %iv + %ptr.next = load ptr, ptr %gep.B, align 8, !tbaa !6 + %l = load i32, ptr %ptr.recur, align 4 + %xor = xor i32 %l, 1 + store i32 %xor, ptr %ptr.recur, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 5 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @indirect_ptr_recurrences_read_write_may_alias_different_obj(ptr %A, ptr %B, ptr %C) { +; CHECK-LABEL: 'indirect_ptr_recurrences_read_write_may_alias_different_obj' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: cannot identify array bounds +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop ] + %ptr.recur = phi ptr [ %A, %entry ], [ %ptr.next, %loop ] + %gep.B = getelementptr inbounds ptr, ptr %B, i64 %iv + %ptr.next = load ptr, ptr %gep.B, align 8, !tbaa !6 + %l = load i32, ptr %ptr.recur, align 4 + %xor = xor i32 %l, 1 + %gep.C = getelementptr inbounds ptr, ptr %C, i64 %iv + store i32 %xor, ptr %gep.C, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 5 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @indirect_ptr_recurrences_read_write_may_noalias_different_obj(ptr %A, ptr %B, ptr noalias %C) { +; CHECK-LABEL: 'indirect_ptr_recurrences_read_write_may_noalias_different_obj' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop ] + %ptr.recur = phi ptr [ %A, %entry ], [ %ptr.next, %loop ] + %gep.B = getelementptr inbounds ptr, ptr %B, i64 %iv + %ptr.next = load ptr, ptr %gep.B, align 8, !tbaa !6 + %l = load i32, ptr %ptr.recur, align 4 + %xor = xor i32 %l, 1 + %gep.C = getelementptr inbounds ptr, ptr %C, i64 %iv + store i32 %xor, ptr %gep.C, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 5 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + + +!6 = !{!7, !7, i64 0} +!7 = !{!"any pointer", !8, i64 0} +!8 = !{!"omnipotent char", !9, i64 0} +!9 = !{!"Simple C/C++ TBAA"} +!10 = !{!11, !11, i64 0} +!11 = !{!"int", !8, i64 0} diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index a08ca86c8a619b..4b6f6159ff23f3 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -63,52 +63,140 @@ define amdgpu_kernel void @writelane(ptr addrspace(1) %out) #0 { ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %A, <16 x half> %B, <8 x float> %C) +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, <8 x float> %C) define amdgpu_kernel void @wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { - %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %A, <16 x half> %B, <8 x float> %C) + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, <8 x float> %C) store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) define amdgpu_kernel void @wmma_f32_16x16x16_ibf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { - %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false) +; CHECK: DIVERGENT: %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false) define amdgpu_kernel void @wmma_f16_16x16x16_f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, ptr addrspace(1) %out) { bb: - %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false) + %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false) store <16 x half> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false) +; CHECK: DIVERGENT: %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false) define amdgpu_kernel void @wmma_f16_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out) { bb: - %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false) + %tmp0 = call <16 x i16> 
@llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false) store <16 x i16> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false) +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false) define amdgpu_kernel void @wmma_i32_16x16x16_ui8(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { bb: - %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false) + %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false) store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false) +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false) define amdgpu_kernel void @wmma_i32_16x16x16_ui4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { bb: - %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false) + %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false) store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 ret void } +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 
x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) +define amdgpu_kernel void @swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) + store <8 x i16> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) +define amdgpu_kernel void @swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 
x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) + store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 false, i32 %A, i1 false, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) +define amdgpu_kernel void @swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4(i1 false, i32 %A, i1 false, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) + store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) +define amdgpu_kernel void @swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) + store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_fp8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x 
float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_fp8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + ; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) %gep) define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) { bb: @@ -190,12 +278,23 @@ declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1 declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #1 declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #1 -declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half>, <16 x half> , <8 x float>) #1 -declare <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16>, <16 x i16> , <8 x float>) #1 -declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1 -declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) #1 -declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) #1 -declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) #1 +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half>, <16 x half> , <8 x float>) #1 +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16>, <16 x i16> , <8 x float>) #1 +declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1 +declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) #1 +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) #1 +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) #1 +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16, i1) +declare <8 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x64.iu4(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16) declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1)) declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1)) diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll index b0507e9d075fab..9687ba683fb7e6 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll @@ -35,16 +35,24 @@ define i8 @load_atomic_i8_aligned_monotonic_const(ptr readonly %ptr) { } define i8 @load_atomic_i8_aligned_acquire(ptr %ptr) { -; CHECK-LABEL: load_atomic_i8_aligned_acquire: -; CHECK: ldapurb w0, [x0, #4] +; GISEL-LABEL: load_atomic_i8_aligned_acquire: +; GISEL: add x8, x0, #4 +; GISEL: ldaprb w0, [x8] +; +; SDAG-LABEL: load_atomic_i8_aligned_acquire: +; SDAG: ldapurb w0, [x0, #4] %gep = getelementptr inbounds i8, ptr %ptr, i32 4 %r = load atomic i8, ptr %gep acquire, align 1 ret i8 %r } define i8 @load_atomic_i8_aligned_acquire_const(ptr readonly %ptr) { -; CHECK-LABEL: load_atomic_i8_aligned_acquire_const: -; CHECK: ldapurb w0, [x0, #4] +; GISEL-LABEL: load_atomic_i8_aligned_acquire_const: +; GISEL: add x8, x0, #4 +; GISEL: ldaprb w0, [x8] +; +; SDAG-LABEL: load_atomic_i8_aligned_acquire_const: +; SDAG: ldapurb w0, [x0, #4] %gep = getelementptr inbounds i8, ptr %ptr, i32 4 %r = load atomic i8, ptr %gep acquire, align 1 ret i8 %r @@ -101,16 +109,24 @@ define i16 
@load_atomic_i16_aligned_monotonic_const(ptr readonly %ptr) { } define i16 @load_atomic_i16_aligned_acquire(ptr %ptr) { -; CHECK-LABEL: load_atomic_i16_aligned_acquire: -; CHECK: ldapurh w0, [x0, #8] +; GISEL-LABEL: load_atomic_i16_aligned_acquire: +; GISEL: add x8, x0, #8 +; GISEL: ldaprh w0, [x8] +; +; SDAG-LABEL: load_atomic_i16_aligned_acquire: +; SDAG: ldapurh w0, [x0, #8] %gep = getelementptr inbounds i16, ptr %ptr, i32 4 %r = load atomic i16, ptr %gep acquire, align 2 ret i16 %r } define i16 @load_atomic_i16_aligned_acquire_const(ptr readonly %ptr) { -; CHECK-LABEL: load_atomic_i16_aligned_acquire_const: -; CHECK: ldapurh w0, [x0, #8] +; GISEL-LABEL: load_atomic_i16_aligned_acquire_const: +; GISEL: add x8, x0, #8 +; GISEL: ldaprh w0, [x8] +; +; SDAG-LABEL: load_atomic_i16_aligned_acquire_const: +; SDAG: ldapurh w0, [x0, #8] %gep = getelementptr inbounds i16, ptr %ptr, i32 4 %r = load atomic i16, ptr %gep acquire, align 2 ret i16 %r @@ -367,16 +383,24 @@ define i8 @load_atomic_i8_unaligned_monotonic_const(ptr readonly %ptr) { } define i8 @load_atomic_i8_unaligned_acquire(ptr %ptr) { -; CHECK-LABEL: load_atomic_i8_unaligned_acquire: -; CHECK: ldapurb w0, [x0, #4] +; GISEL-LABEL: load_atomic_i8_unaligned_acquire: +; GISEL: add x8, x0, #4 +; GISEL: ldaprb w0, [x8] +; +; SDAG-LABEL: load_atomic_i8_unaligned_acquire: +; SDAG: ldapurb w0, [x0, #4] %gep = getelementptr inbounds i8, ptr %ptr, i32 4 %r = load atomic i8, ptr %gep acquire, align 1 ret i8 %r } define i8 @load_atomic_i8_unaligned_acquire_const(ptr readonly %ptr) { -; CHECK-LABEL: load_atomic_i8_unaligned_acquire_const: -; CHECK: ldapurb w0, [x0, #4] +; GISEL-LABEL: load_atomic_i8_unaligned_acquire_const: +; GISEL: add x8, x0, #4 +; GISEL: ldaprb w0, [x8] +; +; SDAG-LABEL: load_atomic_i8_unaligned_acquire_const: +; SDAG: ldapurb w0, [x0, #4] %gep = getelementptr inbounds i8, ptr %ptr, i32 4 %r = load atomic i8, ptr %gep acquire, align 1 ret i8 %r @@ -819,7 +843,8 @@ define i128 
@load_atomic_i128_unaligned_seq_cst_const(ptr readonly %ptr) { define i8 @load_atomic_i8_from_gep() { ; GISEL-LABEL: load_atomic_i8_from_gep: ; GISEL: bl init -; GISEL: ldapurb w0, [x8, #1] +; GISEL: add x8, x8, #1 +; GISEL: ldaprb w0, [x8] ; ; SDAG-LABEL: load_atomic_i8_from_gep: ; SDAG: bl init @@ -834,7 +859,8 @@ define i8 @load_atomic_i8_from_gep() { define i16 @load_atomic_i16_from_gep() { ; GISEL-LABEL: load_atomic_i16_from_gep: ; GISEL: bl init -; GISEL: ldapurh w0, [x8, #2] +; GISEL: add x8, x8, #2 +; GISEL: ldaprh w0, [x8] ; ; SDAG-LABEL: load_atomic_i16_from_gep: ; SDAG: bl init @@ -884,7 +910,6 @@ define i128 @load_atomic_i128_from_gep() { ; ; SDAG-LABEL: load_atomic_i128_from_gep: ; SDAG: bl init -; SDAG: ldp x0, x1, [sp, #16] ; SDAG: dmb ishld %a = alloca [3 x i128] call void @init(ptr %a) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll index 0e9c126e97a3d8..6152baf2e40f17 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll @@ -993,24 +993,24 @@ define i8 @atomic_load_relaxed_8(ptr %p, i32 %off32) #0 { ; CHECK-NOLSE-O1: ; %bb.0: ; CHECK-NOLSE-O1-NEXT: ldrb w8, [x0, #4095] ; CHECK-NOLSE-O1-NEXT: ldrb w9, [x0, w1, sxtw] -; CHECK-NOLSE-O1-NEXT: add x11, x0, #291, lsl #12 ; =1191936 ; CHECK-NOLSE-O1-NEXT: ldurb w10, [x0, #-256] -; CHECK-NOLSE-O1-NEXT: add w8, w8, w9 -; CHECK-NOLSE-O1-NEXT: ldrb w9, [x11] -; CHECK-NOLSE-O1-NEXT: add w8, w8, w10 -; CHECK-NOLSE-O1-NEXT: add w0, w8, w9 +; CHECK-NOLSE-O1-NEXT: add w8, w9, w8, uxtb +; CHECK-NOLSE-O1-NEXT: add x9, x0, #291, lsl #12 ; =1191936 +; CHECK-NOLSE-O1-NEXT: ldrb w9, [x9] +; CHECK-NOLSE-O1-NEXT: add w8, w8, w10, uxtb +; CHECK-NOLSE-O1-NEXT: add w0, w8, w9, uxtb ; CHECK-NOLSE-O1-NEXT: ret ; ; CHECK-OUTLINE-O1-LABEL: atomic_load_relaxed_8: ; CHECK-OUTLINE-O1: ; %bb.0: ; CHECK-OUTLINE-O1-NEXT: ldrb w8, [x0, #4095] ; CHECK-OUTLINE-O1-NEXT: ldrb w9, [x0, w1, sxtw] -; 
CHECK-OUTLINE-O1-NEXT: add x11, x0, #291, lsl #12 ; =1191936 ; CHECK-OUTLINE-O1-NEXT: ldurb w10, [x0, #-256] -; CHECK-OUTLINE-O1-NEXT: add w8, w8, w9 -; CHECK-OUTLINE-O1-NEXT: ldrb w9, [x11] -; CHECK-OUTLINE-O1-NEXT: add w8, w8, w10 -; CHECK-OUTLINE-O1-NEXT: add w0, w8, w9 +; CHECK-OUTLINE-O1-NEXT: add w8, w9, w8, uxtb +; CHECK-OUTLINE-O1-NEXT: add x9, x0, #291, lsl #12 ; =1191936 +; CHECK-OUTLINE-O1-NEXT: ldrb w9, [x9] +; CHECK-OUTLINE-O1-NEXT: add w8, w8, w10, uxtb +; CHECK-OUTLINE-O1-NEXT: add w0, w8, w9, uxtb ; CHECK-OUTLINE-O1-NEXT: ret ; ; CHECK-NOLSE-O0-LABEL: atomic_load_relaxed_8: @@ -1045,12 +1045,12 @@ define i8 @atomic_load_relaxed_8(ptr %p, i32 %off32) #0 { ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldrb w8, [x0, #4095] ; CHECK-LSE-O1-NEXT: ldrb w9, [x0, w1, sxtw] -; CHECK-LSE-O1-NEXT: ldurb w10, [x0, #-256] -; CHECK-LSE-O1-NEXT: add w8, w8, w10 -; CHECK-LSE-O1-NEXT: add w8, w8, w9 +; CHECK-LSE-O1-NEXT: add w8, w9, w8, uxtb +; CHECK-LSE-O1-NEXT: ldurb w9, [x0, #-256] +; CHECK-LSE-O1-NEXT: add w8, w8, w9, uxtb ; CHECK-LSE-O1-NEXT: add x9, x0, #291, lsl #12 ; =1191936 ; CHECK-LSE-O1-NEXT: ldrb w9, [x9] -; CHECK-LSE-O1-NEXT: add w0, w8, w9 +; CHECK-LSE-O1-NEXT: add w0, w8, w9, uxtb ; CHECK-LSE-O1-NEXT: ret ; ; CHECK-LSE-O0-LABEL: atomic_load_relaxed_8: @@ -1089,24 +1089,24 @@ define i16 @atomic_load_relaxed_16(ptr %p, i32 %off32) #0 { ; CHECK-NOLSE-O1: ; %bb.0: ; CHECK-NOLSE-O1-NEXT: ldrh w8, [x0, #8190] ; CHECK-NOLSE-O1-NEXT: ldrh w9, [x0, w1, sxtw #1] -; CHECK-NOLSE-O1-NEXT: add x11, x0, #291, lsl #12 ; =1191936 ; CHECK-NOLSE-O1-NEXT: ldurh w10, [x0, #-256] -; CHECK-NOLSE-O1-NEXT: add w8, w8, w9 -; CHECK-NOLSE-O1-NEXT: ldrh w9, [x11] -; CHECK-NOLSE-O1-NEXT: add w8, w8, w10 -; CHECK-NOLSE-O1-NEXT: add w0, w8, w9 +; CHECK-NOLSE-O1-NEXT: add w8, w9, w8, uxth +; CHECK-NOLSE-O1-NEXT: add x9, x0, #291, lsl #12 ; =1191936 +; CHECK-NOLSE-O1-NEXT: ldrh w9, [x9] +; CHECK-NOLSE-O1-NEXT: add w8, w8, w10, uxth +; CHECK-NOLSE-O1-NEXT: add w0, w8, w9, uxth ; 
CHECK-NOLSE-O1-NEXT: ret ; ; CHECK-OUTLINE-O1-LABEL: atomic_load_relaxed_16: ; CHECK-OUTLINE-O1: ; %bb.0: ; CHECK-OUTLINE-O1-NEXT: ldrh w8, [x0, #8190] ; CHECK-OUTLINE-O1-NEXT: ldrh w9, [x0, w1, sxtw #1] -; CHECK-OUTLINE-O1-NEXT: add x11, x0, #291, lsl #12 ; =1191936 ; CHECK-OUTLINE-O1-NEXT: ldurh w10, [x0, #-256] -; CHECK-OUTLINE-O1-NEXT: add w8, w8, w9 -; CHECK-OUTLINE-O1-NEXT: ldrh w9, [x11] -; CHECK-OUTLINE-O1-NEXT: add w8, w8, w10 -; CHECK-OUTLINE-O1-NEXT: add w0, w8, w9 +; CHECK-OUTLINE-O1-NEXT: add w8, w9, w8, uxth +; CHECK-OUTLINE-O1-NEXT: add x9, x0, #291, lsl #12 ; =1191936 +; CHECK-OUTLINE-O1-NEXT: ldrh w9, [x9] +; CHECK-OUTLINE-O1-NEXT: add w8, w8, w10, uxth +; CHECK-OUTLINE-O1-NEXT: add w0, w8, w9, uxth ; CHECK-OUTLINE-O1-NEXT: ret ; ; CHECK-NOLSE-O0-LABEL: atomic_load_relaxed_16: @@ -1141,12 +1141,12 @@ define i16 @atomic_load_relaxed_16(ptr %p, i32 %off32) #0 { ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldrh w8, [x0, #8190] ; CHECK-LSE-O1-NEXT: ldrh w9, [x0, w1, sxtw #1] -; CHECK-LSE-O1-NEXT: ldurh w10, [x0, #-256] -; CHECK-LSE-O1-NEXT: add w8, w8, w10 -; CHECK-LSE-O1-NEXT: add w8, w8, w9 +; CHECK-LSE-O1-NEXT: add w8, w9, w8, uxth +; CHECK-LSE-O1-NEXT: ldurh w9, [x0, #-256] +; CHECK-LSE-O1-NEXT: add w8, w8, w9, uxth ; CHECK-LSE-O1-NEXT: add x9, x0, #291, lsl #12 ; =1191936 ; CHECK-LSE-O1-NEXT: ldrh w9, [x9] -; CHECK-LSE-O1-NEXT: add w0, w8, w9 +; CHECK-LSE-O1-NEXT: add w0, w8, w9, uxth ; CHECK-LSE-O1-NEXT: ret ; ; CHECK-LSE-O0-LABEL: atomic_load_relaxed_16: diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll index 5a7bd6ee20f9b4..22e283b0e0994a 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll @@ -385,13 +385,13 @@ define i8 @atomic_load_relaxed_8(ptr %p, i32 %off32) { ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDRBBui renamable $x0, 4095, pcsections 
!0 :: (load monotonic (s8) from %ir.ptr_unsigned) - ; CHECK-NEXT: renamable $w9 = LDRBBroW renamable $x0, killed renamable $w1, 1, 0, pcsections !0 :: (load unordered (s8) from %ir.ptr_regoff) - ; CHECK-NEXT: renamable $w10 = LDURBBi renamable $x0, -256, pcsections !0 :: (load monotonic (s8) from %ir.ptr_unscaled) - ; CHECK-NEXT: renamable $x11 = ADDXri killed renamable $x0, 291, 12 - ; CHECK-NEXT: $w8 = ADDWrs killed renamable $w8, killed renamable $w9, 0, pcsections !0 - ; CHECK-NEXT: renamable $w9 = LDRBBui killed renamable $x11, 0, pcsections !0 :: (load unordered (s8) from %ir.ptr_random) - ; CHECK-NEXT: $w8 = ADDWrs killed renamable $w8, killed renamable $w10, 0, pcsections !0 - ; CHECK-NEXT: $w0 = ADDWrs killed renamable $w8, killed renamable $w9, 0, pcsections !0 + ; CHECK-NEXT: renamable $w9 = LDRBBroW renamable $x0, killed renamable $w1, 1, 0 :: (load unordered (s8) from %ir.ptr_regoff) + ; CHECK-NEXT: renamable $w10 = LDURBBi renamable $x0, -256 :: (load monotonic (s8) from %ir.ptr_unscaled) + ; CHECK-NEXT: renamable $w8 = ADDWrx killed renamable $w9, killed renamable $w8, 0, pcsections !0 + ; CHECK-NEXT: renamable $x9 = ADDXri killed renamable $x0, 291, 12 + ; CHECK-NEXT: renamable $w8 = ADDWrx killed renamable $w8, killed renamable $w10, 0, pcsections !0 + ; CHECK-NEXT: renamable $w9 = LDRBBui killed renamable $x9, 0, pcsections !0 :: (load unordered (s8) from %ir.ptr_random) + ; CHECK-NEXT: renamable $w0 = ADDWrx killed renamable $w8, killed renamable $w9, 0, pcsections !0 ; CHECK-NEXT: RET undef $lr, implicit $w0 %ptr_unsigned = getelementptr i8, ptr %p, i32 4095 %val_unsigned = load atomic i8, ptr %ptr_unsigned monotonic, align 1, !pcsections !0 @@ -417,13 +417,13 @@ define i16 @atomic_load_relaxed_16(ptr %p, i32 %off32) { ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDRHHui renamable $x0, 4095, pcsections !0 :: (load monotonic (s16) from %ir.ptr_unsigned) - ; CHECK-NEXT: renamable $w9 = LDRHHroW renamable $x0, 
killed renamable $w1, 1, 1, pcsections !0 :: (load unordered (s16) from %ir.ptr_regoff) - ; CHECK-NEXT: renamable $w10 = LDURHHi renamable $x0, -256, pcsections !0 :: (load monotonic (s16) from %ir.ptr_unscaled) - ; CHECK-NEXT: renamable $x11 = ADDXri killed renamable $x0, 291, 12 - ; CHECK-NEXT: $w8 = ADDWrs killed renamable $w8, killed renamable $w9, 0, pcsections !0 - ; CHECK-NEXT: renamable $w9 = LDRHHui killed renamable $x11, 0, pcsections !0 :: (load unordered (s16) from %ir.ptr_random) - ; CHECK-NEXT: $w8 = ADDWrs killed renamable $w8, killed renamable $w10, 0, pcsections !0 - ; CHECK-NEXT: $w0 = ADDWrs killed renamable $w8, killed renamable $w9, 0, pcsections !0 + ; CHECK-NEXT: renamable $w9 = LDRHHroW renamable $x0, killed renamable $w1, 1, 1 :: (load unordered (s16) from %ir.ptr_regoff) + ; CHECK-NEXT: renamable $w10 = LDURHHi renamable $x0, -256 :: (load monotonic (s16) from %ir.ptr_unscaled) + ; CHECK-NEXT: renamable $w8 = ADDWrx killed renamable $w9, killed renamable $w8, 8, pcsections !0 + ; CHECK-NEXT: renamable $x9 = ADDXri killed renamable $x0, 291, 12 + ; CHECK-NEXT: renamable $w8 = ADDWrx killed renamable $w8, killed renamable $w10, 8, pcsections !0 + ; CHECK-NEXT: renamable $w9 = LDRHHui killed renamable $x9, 0, pcsections !0 :: (load unordered (s16) from %ir.ptr_random) + ; CHECK-NEXT: renamable $w0 = ADDWrx killed renamable $w8, killed renamable $w9, 8, pcsections !0 ; CHECK-NEXT: RET undef $lr, implicit $w0 %ptr_unsigned = getelementptr i16, ptr %p, i32 4095 %val_unsigned = load atomic i16, ptr %ptr_unsigned monotonic, align 2, !pcsections !0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/atomic-anyextending-load-crash.ll b/llvm/test/CodeGen/AArch64/GlobalISel/atomic-anyextending-load-crash.ll new file mode 100644 index 00000000000000..4bb4e4882410dc --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/atomic-anyextending-load-crash.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 4 +; RUN: llc -global-isel -global-isel-abort=1 -O0 -o - %s | FileCheck %s +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64e-apple-macosx14.0.0" + +define void @test(ptr %0) { +; CHECK-LABEL: test: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: sub sp, sp, #144 +; CHECK-NEXT: stp x29, x30, [sp, #128] ; 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 144 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ldar w8, [x0] +; CHECK-NEXT: str w8, [sp, #116] ; 4-byte Folded Spill +; CHECK-NEXT: mov x8, #0 ; =0x0 +; CHECK-NEXT: str x8, [sp, #120] ; 8-byte Folded Spill +; CHECK-NEXT: blr x8 +; CHECK-NEXT: ldr w11, [sp, #116] ; 4-byte Folded Reload +; CHECK-NEXT: ldr x8, [sp, #120] ; 8-byte Folded Reload +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: str xzr, [x9, #8] +; CHECK-NEXT: str xzr, [x9, #16] +; CHECK-NEXT: str xzr, [x9, #24] +; CHECK-NEXT: str xzr, [x9, #32] +; CHECK-NEXT: str xzr, [x9, #40] +; CHECK-NEXT: ; implicit-def: $x10 +; CHECK-NEXT: mov x10, x11 +; CHECK-NEXT: str x10, [x9, #48] +; CHECK-NEXT: str xzr, [x9, #56] +; CHECK-NEXT: str xzr, [x9, #64] +; CHECK-NEXT: str xzr, [x9, #72] +; CHECK-NEXT: str xzr, [x9, #80] +; CHECK-NEXT: str xzr, [x9, #88] +; CHECK-NEXT: str xzr, [x9, #96] +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: blr x8 +; CHECK-NEXT: ldp x29, x30, [sp, #128] ; 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #144 +; CHECK-NEXT: ret +entry: + %atomic-load = load atomic i32, ptr %0 seq_cst, align 4 + %call10 = call ptr null() + call void (ptr, ...) 
null(ptr null, ptr null, i32 0, ptr null, ptr null, i32 0, i32 0, i32 %atomic-load, i32 0, i32 0, i32 0, i32 0, i64 0, ptr null) + ret void +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir index 16b780a8397347..661265173ae82b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir @@ -529,3 +529,27 @@ body: | RET_ReallyLR implicit $q0 ... + +--- +name: pr81244 +tracksRegLiveness: true +body: | + bb.0: + liveins: $d0 + ; CHECK-LABEL: name: pr81244 + ; CHECK: liveins: $d0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s8>) = G_TRUNC [[COPY]](<2 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s8>) = G_CONCAT_VECTORS [[TRUNC]](<2 x s8>), [[TRUNC]](<2 x s8>) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[CONCAT_VECTORS]](<4 x s8>) + ; CHECK-NEXT: $d0 = COPY [[ANYEXT]](<4 x s16>) + ; CHECK-NEXT: RET_ReallyLR implicit $d0 + %0:_(<2 x s32>) = COPY $d0 + %1:_(<2 x s8>) = G_TRUNC %0(<2 x s32>) + %2:_(<4 x s8>) = G_CONCAT_VECTORS %1(<2 x s8>), %1(<2 x s8>) + %3:_(<4 x s16>) = G_ANYEXT %2(<4 x s8>) + $d0 = COPY %3(<4 x s16>) + RET_ReallyLR implicit $d0 + +... 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.ll b/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.ll index 07744dada4f1fa..b1166e683ec74e 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.ll @@ -323,3 +323,22 @@ define i32 @test_alias_3xs16(ptr %ptr, ptr %ptr2, ptr %ptr3, ptr noalias %safe_p store i32 14, ptr %addr4 ret i32 %safeld } + +@G = external global [10 x i32] + +define void @invalid_zero_offset_no_merge(i64 %0) { +; CHECK-LABEL: invalid_zero_offset_no_merge: +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh0: +; CHECK-NEXT: adrp x8, _G@GOTPAGE +; CHECK-NEXT: Lloh1: +; CHECK-NEXT: ldr x8, [x8, _G@GOTPAGEOFF] +; CHECK-NEXT: str wzr, [x8, x0, lsl #2] +; CHECK-NEXT: str wzr, [x8, #4] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdrGot Lloh0, Lloh1 + %2 = getelementptr [10 x i32], ptr @G, i64 0, i64 %0 + store i32 0, ptr %2, align 4 + store i32 0, ptr getelementptr inbounds ([10 x i32], ptr @G, i64 0, i64 1), align 4 + ret void +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.mir b/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.mir index e98e1ce599f2f6..69457a8cc0c19f 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.mir @@ -162,6 +162,13 @@ ret void } + @G = external global [10 x i32] + define void @invalid_zero_offset_no_merge(i64 %0) { + %2 = getelementptr [10 x i32], ptr @G, i64 0, i64 %0 + store i32 0, ptr %2, align 4 + store i32 0, ptr getelementptr inbounds ([10 x i32], ptr @G, i64 0, i64 1), align 4 + ret void + } ... --- name: test_simple_2xs8 @@ -582,13 +589,11 @@ liveins: frameInfo: maxAlignment: 1 machineFunctionInfo: {} +# The store to ptr2 prevents merging into a single store. +# We can still merge the stores into addr1 and addr2. body: | bb.1 (%ir-block.0): liveins: $x0, $x1 - - ; The store to ptr2 prevents merging into a single store. 
- ; We can still merge the stores into addr1 and addr2. - ; CHECK-LABEL: name: test_alias_4xs16 ; CHECK: liveins: $x0, $x1 ; CHECK-NEXT: {{ $}} @@ -639,10 +644,10 @@ liveins: frameInfo: maxAlignment: 1 machineFunctionInfo: {} +# Here store of 5 and 9 can be merged, others have aliasing barriers. body: | bb.1 (%ir-block.0): liveins: $x0, $x1, $x2 - ; Here store of 5 and 9 can be merged, others have aliasing barriers. ; CHECK-LABEL: name: test_alias2_4xs16 ; CHECK: liveins: $x0, $x1, $x2 ; CHECK-NEXT: {{ $}} @@ -698,12 +703,11 @@ liveins: frameInfo: maxAlignment: 1 machineFunctionInfo: {} +# No merging can be done here. body: | bb.1 (%ir-block.0): liveins: $x0, $x1, $x2, $x3 - ; No merging can be done here. - ; CHECK-LABEL: name: test_alias3_4xs16 ; CHECK: liveins: $x0, $x1, $x2, $x3 ; CHECK-NEXT: {{ $}} @@ -767,12 +771,10 @@ stack: - { id: 0, name: a1, size: 24, alignment: 4 } - { id: 1, name: a2, size: 4, alignment: 4 } machineFunctionInfo: {} +# Can merge because the load is from a different alloca and can't alias. body: | bb.1 (%ir-block.0): liveins: $x0 - - ; Can merge because the load is from a different alloca and can't alias. - ; CHECK-LABEL: name: test_alias_allocas_2xs32 ; CHECK: liveins: $x0 ; CHECK-NEXT: {{ $}} @@ -826,3 +828,43 @@ body: | RET_ReallyLR ... 
+--- +name: invalid_zero_offset_no_merge +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$x0' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.1 (%ir-block.1): + liveins: $x0 + + ; CHECK-LABEL: name: invalid_zero_offset_no_merge + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s64) + ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @G + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[GV]], [[SHL]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: G_STORE [[C1]](s32), [[PTR_ADD]](p0) :: (store (s32) into %ir.2) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = nuw G_PTR_ADD [[GV]], [[C2]](s64) + ; CHECK-NEXT: G_STORE [[C1]](s32), [[PTR_ADD1]](p0) :: (store (s32) into `ptr getelementptr inbounds ([10 x i32], ptr @G, i64 0, i64 1)`) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s64) = COPY $x0 + %9:_(s64) = G_CONSTANT i64 2 + %3:_(s64) = G_SHL %0, %9(s64) + %1:_(p0) = G_GLOBAL_VALUE @G + %4:_(p0) = G_PTR_ADD %1, %3(s64) + %6:_(s32) = G_CONSTANT i32 0 + G_STORE %6(s32), %4(p0) :: (store (s32) into %ir.2) + %8:_(s64) = G_CONSTANT i64 4 + %7:_(p0) = nuw G_PTR_ADD %1, %8(s64) + G_STORE %6(s32), %7(p0) :: (store (s32) into `ptr getelementptr inbounds ([10 x i32], ptr @G, i64 0, i64 1)`) + RET_ReallyLR + +... 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-za-clobber.ll b/llvm/test/CodeGen/AArch64/aarch64-za-clobber.ll new file mode 100644 index 00000000000000..a8cba7dc9a91e9 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-za-clobber.ll @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-none-linux-gnu -stop-after=aarch64-isel < %s -o - | FileCheck %s + +define void @alpha( %x) local_unnamed_addr { +entry: +; CHECK: INLINEASM &"movt zt0[3, mul vl], z0", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $za + tail call void asm sideeffect "movt zt0[3, mul vl], z0", "~{za}"() + ret void +} + +define void @beta( %x) local_unnamed_addr { +entry: +; CHECK: INLINEASM &"movt zt0[3, mul vl], z0", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $zt0 + tail call void asm sideeffect "movt zt0[3, mul vl], z0", "~{zt0}"() + ret void +} diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll index 2181eaaee7db68..d39029163a47aa 100644 --- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll +++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll @@ -214,8 +214,9 @@ define void @t17(i64 %a) { define i8 @LdOffset_i8(ptr %a) { ; CHECK-LABEL: LdOffset_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288 -; CHECK-NEXT: ldrb w0, [x8, #3704] +; CHECK-NEXT: mov w8, #56952 // =0xde78 +; CHECK-NEXT: movk w8, #15, lsl #16 +; CHECK-NEXT: ldrb w0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 %val = load i8, ptr %arrayidx, align 1 @@ -226,8 +227,9 @@ define i8 @LdOffset_i8(ptr %a) { define i32 @LdOffset_i8_zext32(ptr %a) { ; CHECK-LABEL: LdOffset_i8_zext32: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288 -; CHECK-NEXT: ldrb w0, [x8, #3704] +; CHECK-NEXT: mov w8, #56952 // =0xde78 +; CHECK-NEXT: movk w8, #15, lsl #16 +; CHECK-NEXT: ldrb w0, [x0, x8] 
; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 %val = load i8, ptr %arrayidx, align 1 @@ -253,8 +255,9 @@ define i32 @LdOffset_i8_sext32(ptr %a) { define i64 @LdOffset_i8_zext64(ptr %a) { ; CHECK-LABEL: LdOffset_i8_zext64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288 -; CHECK-NEXT: ldrb w0, [x8, #3704] +; CHECK-NEXT: mov w8, #56952 // =0xde78 +; CHECK-NEXT: movk w8, #15, lsl #16 +; CHECK-NEXT: ldrb w0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 %val = load i8, ptr %arrayidx, align 1 diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll index fe4da2e7cf36b5..89c8d540b97e04 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll @@ -1848,3 +1848,51 @@ define <2 x i128> @uabd_i64(<2 x i64> %a, <2 x i64> %b) { %absel = select <2 x i1> %abcmp, <2 x i128> %ababs, <2 x i128> %abdiff ret <2 x i128> %absel } + +define <8 x i16> @pr88784(<8 x i8> %l0, <8 x i8> %l1, <8 x i16> %l2) { +; CHECK-SD-LABEL: pr88784: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: usubl.8h v0, v0, v1 +; CHECK-SD-NEXT: cmlt.8h v1, v2, #0 +; CHECK-SD-NEXT: ssra.8h v0, v2, #15 +; CHECK-SD-NEXT: eor.16b v0, v1, v0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: pr88784: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: usubl.8h v0, v0, v1 +; CHECK-GI-NEXT: sshr.8h v1, v2, #15 +; CHECK-GI-NEXT: ssra.8h v0, v2, #15 +; CHECK-GI-NEXT: eor.16b v0, v1, v0 +; CHECK-GI-NEXT: ret + %l4 = zext <8 x i8> %l0 to <8 x i16> + %l5 = ashr <8 x i16> %l2, + %l6 = zext <8 x i8> %l1 to <8 x i16> + %l7 = sub <8 x i16> %l4, %l6 + %l8 = add <8 x i16> %l5, %l7 + %l9 = xor <8 x i16> %l5, %l8 + ret <8 x i16> %l9 +} + +define <8 x i16> @pr88784_fixed(<8 x i8> %l0, <8 x i8> %l1, <8 x i16> %l2) { +; CHECK-SD-LABEL: pr88784_fixed: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: uabdl.8h v0, v0, v1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: pr88784_fixed: +; CHECK-GI: // %bb.0: 
+; CHECK-GI-NEXT: usubl.8h v0, v0, v1 +; CHECK-GI-NEXT: sshr.8h v1, v0, #15 +; CHECK-GI-NEXT: ssra.8h v0, v0, #15 +; CHECK-GI-NEXT: eor.16b v0, v1, v0 +; CHECK-GI-NEXT: ret + %l4 = zext <8 x i8> %l0 to <8 x i16> + %l6 = zext <8 x i8> %l1 to <8 x i16> + %l7 = sub <8 x i16> %l4, %l6 + %l5 = ashr <8 x i16> %l7, + %l8 = add <8 x i16> %l5, %l7 + %l9 = xor <8 x i16> %l5, %l8 + ret <8 x i16> %l9 +} + diff --git a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll index 5c56f51e1ca554..bb9ba05f7a2724 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll @@ -147,8 +147,8 @@ define void @has_varargs(...) nounwind { ; CHECK-NEXT: add x29, sp, #160 ; CHECK-NEXT: .seh_add_fp 160 ; CHECK-NEXT: .seh_endprologue -; CHECK-NEXT: ldp x8, x5, [x4, #32] -; CHECK-NEXT: mov x4, x8 +; CHECK-NEXT: add x4, x4, #32 +; CHECK-NEXT: mov x5, xzr ; CHECK-NEXT: blr x9 ; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_ret ; CHECK-NEXT: ldr x0, [x8, :lo12:__os_arm64x_dispatch_ret] diff --git a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll index dc16b3a1a0f270..844fc52ddade63 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll @@ -100,5 +100,42 @@ define void @varargs_many_argscalleer() nounwind { ret void } +define void @varargs_caller_tail() nounwind { +; CHECK-LABEL: varargs_caller_tail: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: mov x4, sp +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x9, #4617315517961601024 // =0x4014000000000000 +; CHECK-NEXT: mov x0, #4607182418800017408 // =0x3ff0000000000000 +; CHECK-NEXT: mov w1, #2 // =0x2 +; CHECK-NEXT: mov x2, #4613937818241073152 // =0x4008000000000000 +; CHECK-NEXT: mov w3, #4 // =0x4 +; CHECK-NEXT: mov w5, #16 // =0x10 +; CHECK-NEXT: stp xzr, x30, [sp, #24] // 8-byte Folded Spill +; CHECK-NEXT: stp x9, x8, [sp] +; 
CHECK-NEXT: str xzr, [sp, #16] +; CHECK-NEXT: .weak_anti_dep varargs_callee +; CHECK-NEXT:.set varargs_callee, "#varargs_callee"@WEAKREF +; CHECK-NEXT: .weak_anti_dep "#varargs_callee" +; CHECK-NEXT:.set "#varargs_callee", varargs_callee@WEAKREF +; CHECK-NEXT: bl "#varargs_callee" +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: add x4, sp, #48 +; CHECK-NEXT: mov x0, #4607182418800017408 // =0x3ff0000000000000 +; CHECK-NEXT: mov w1, #4 // =0x4 +; CHECK-NEXT: mov w2, #3 // =0x3 +; CHECK-NEXT: mov w3, #2 // =0x2 +; CHECK-NEXT: mov x5, xzr +; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: .weak_anti_dep varargs_callee +; CHECK-NEXT:.set varargs_callee, "#varargs_callee"@WEAKREF +; CHECK-NEXT: .weak_anti_dep "#varargs_callee" +; CHECK-NEXT:.set "#varargs_callee", varargs_callee@WEAKREF +; CHECK-NEXT: b "#varargs_callee" + call void (double, ...) @varargs_callee(double 1.0, i32 2, double 3.0, i32 4, double 5.0, <2 x double> ) + tail call void (double, ...) @varargs_callee(double 1.0, i32 4, i32 3, i32 2) + ret void +} declare void @llvm.va_start(ptr) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll index 93497f38063d28..7b8448de2331b4 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s ; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16,+sve -o - | FileCheck %s +; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16,+sve2 -o - | FileCheck %s target triple = "aarch64" @@ -158,6 +159,32 @@ entry: ret <16 x half> %interleaved.vec } + +; Expected not to transform as it is integer +define <16 x i16> @complex_add_v16i16(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: complex_add_v16i16: +; CHECK: // %bb.0: // %entry +; 
CHECK-NEXT: uzp1 v4.8h, v2.8h, v3.8h +; CHECK-NEXT: uzp1 v5.8h, v0.8h, v1.8h +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: uzp2 v1.8h, v2.8h, v3.8h +; CHECK-NEXT: sub v2.8h, v4.8h, v0.8h +; CHECK-NEXT: add v1.8h, v1.8h, v5.8h +; CHECK-NEXT: zip1 v0.8h, v2.8h, v1.8h +; CHECK-NEXT: zip2 v1.8h, v2.8h, v1.8h +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <8 x i32> + %a.imag = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <8 x i32> + %b.real = shufflevector <16 x i16> %b, <16 x i16> zeroinitializer, <8 x i32> + %b.imag = shufflevector <16 x i16> %b, <16 x i16> zeroinitializer, <8 x i32> + %0 = sub <8 x i16> %b.real, %a.imag + %1 = add <8 x i16> %b.imag, %a.real + %interleaved.vec = shufflevector <8 x i16> %0, <8 x i16> %1, <16 x i32> + ret <16 x i16> %interleaved.vec +} + + declare { <2 x half>, <2 x half> } @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>) declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>) diff --git a/llvm/test/CodeGen/AArch64/cpus.ll b/llvm/test/CodeGen/AArch64/cpus.ll index b24866064efaea..7b45d0f30bcdd4 100644 --- a/llvm/test/CodeGen/AArch64/cpus.ll +++ b/llvm/test/CodeGen/AArch64/cpus.ll @@ -37,6 +37,7 @@ ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=a64fx 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=ampere1 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=ampere1a 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=ampere1b 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID ; CHECK-NOT: {{.*}} is not a recognized processor for this target diff --git a/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir index 15b6700398ea08..488f1ffdb52f3b 100755 --- a/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir +++ 
b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir @@ -14,8 +14,9 @@ body: | ; CHECK-LABEL: name: LdOffset ; CHECK: liveins: $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $x8 = ADDXri $x0, 253, 12 - ; CHECK-NEXT: renamable $w0 = LDRBBui killed renamable $x8, 3704 + ; CHECK-NEXT: renamable $w8 = MOVZWi 56952, 0 + ; CHECK-NEXT: renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8 + ; CHECK-NEXT: renamable $w0 = LDRBBroX killed renamable $x0, killed renamable $x8, 0, 0 ; CHECK-NEXT: RET undef $lr, implicit $w0 renamable $w8 = MOVZWi 56952, 0 renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8 diff --git a/llvm/test/CodeGen/AArch64/neon-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-dot-product.ll index 23d1e43a5cab1b..cf09a46000dab9 100644 --- a/llvm/test/CodeGen/AArch64/neon-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/neon-dot-product.ll @@ -7,6 +7,7 @@ ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-n2 < %s | FileCheck %s ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1 < %s | FileCheck %s ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1a < %s | FileCheck %s +; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1b < %s | FileCheck %s declare <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) declare <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) diff --git a/llvm/test/CodeGen/AArch64/remat.ll b/llvm/test/CodeGen/AArch64/remat.ll index 483c4d71ee21fb..704c87feb6a9b8 100644 --- a/llvm/test/CodeGen/AArch64/remat.ll +++ b/llvm/test/CodeGen/AArch64/remat.ll @@ -26,6 +26,7 @@ ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=thunderx3t110 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=ampere1 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=ampere1a -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=ampere1b -o - %s | FileCheck %s %X = type { i64, i64, i64 } declare void @f(ptr) diff --git 
a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index e18e18a1cfad18..381091b4539433 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -23,9 +23,9 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline ; CHECK-FISEL-NEXT: bl streaming_callee ; CHECK-FISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-FISEL-NEXT: smstop sm +; CHECK-FISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload ; CHECK-FISEL-NEXT: adrp x8, .LCPI0_0 ; CHECK-FISEL-NEXT: ldr d0, [x8, :lo12:.LCPI0_0] -; CHECK-FISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload ; CHECK-FISEL-NEXT: fadd d0, d1, d0 ; CHECK-FISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-FISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload @@ -49,9 +49,9 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline ; CHECK-GISEL-NEXT: bl streaming_callee ; CHECK-GISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-GISEL-NEXT: smstop sm +; CHECK-GISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload ; CHECK-GISEL-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; CHECK-GISEL-NEXT: fmov d0, x8 -; CHECK-GISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload ; CHECK-GISEL-NEXT: fadd d0, d1, d0 ; CHECK-GISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-GISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload @@ -82,9 +82,9 @@ define double @streaming_caller_nonstreaming_callee(double %x) nounwind noinline ; CHECK-COMMON-NEXT: bl normal_callee ; CHECK-COMMON-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm +; CHECK-COMMON-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; CHECK-COMMON-NEXT: fmov d0, x8 -; CHECK-COMMON-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: fadd d0, d1, d0 
; CHECK-COMMON-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload @@ -110,14 +110,16 @@ define double @locally_streaming_caller_normal_callee(double %x) nounwind noinli ; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: str d0, [sp, #24] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm +; CHECK-COMMON-NEXT: ldr d0, [sp, #24] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: str d0, [sp, #24] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstop sm ; CHECK-COMMON-NEXT: ldr d0, [sp, #24] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: bl normal_callee ; CHECK-COMMON-NEXT: str d0, [sp, #16] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm +; CHECK-COMMON-NEXT: ldr d1, [sp, #16] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; CHECK-COMMON-NEXT: fmov d0, x8 -; CHECK-COMMON-NEXT: ldr d1, [sp, #16] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: fadd d0, d1, d0 ; CHECK-COMMON-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstop sm @@ -329,9 +331,9 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-COMMON-NEXT: stp q1, q0, [sp] // 32-byte Folded Spill ; CHECK-COMMON-NEXT: smstop sm -; CHECK-COMMON-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload +; CHECK-COMMON-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload ; CHECK-COMMON-NEXT: bl __addtf3 ; CHECK-COMMON-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm @@ -390,9 +392,9 @@ define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounw ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; 
CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: stp s1, s0, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstop sm -; CHECK-COMMON-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp s1, s0, [sp, #8] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: bl fmodf ; CHECK-COMMON-NEXT: str s0, [sp, #12] // 4-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm @@ -420,7 +422,9 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati ; CHECK-COMMON-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: bl __arm_sme_state +; CHECK-COMMON-NEXT: ldp s2, s0, [sp, #8] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: and x19, x0, #0x1 +; CHECK-COMMON-NEXT: stp s2, s0, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: tbz w19, #0, .LBB12_2 ; CHECK-COMMON-NEXT: // %bb.1: ; CHECK-COMMON-NEXT: smstop sm diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll new file mode 100644 index 00000000000000..d5bea725b6d14d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll @@ -0,0 +1,1640 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64-unknown-unknown-eabi-elf" + +; This test verifies that call arguments and results are not coalesced +; with SVE vector registers by the coalescer, such that no 'mul vl' +; ldr/str pairs are generated in the streaming-mode-changing call +; sequence. 
+ +; +; Scalar arguments +; + +define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl use_i8 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, i8 %arg, i32 0 + call void @use_i8(i8 %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl use_i16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, i16 %arg, i32 0 + call void @use_i16(i16 %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl use_i32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, i32 %arg, i32 0 + call void @use_i32(i32 %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl use_i64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, i64 %arg, i32 0 + call void @use_i64(i64 %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 +; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload +; CHECK-NEXT: bl use_f16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, half %arg, i32 0 + call void @use_f16(half %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 +; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: bl use_f32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, float %arg, i32 0 + call void @use_f32(float %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_f64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, double %arg, i32 0 + call void @use_f64(double %arg) + store %vec, ptr %ptr + ret void +} + + +; +; Single-element vector arguments +; + +define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v16i8 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x i8> %arg, i32 0 + %vec = insertelement poison, i8 %elt, i32 0 + call void @use_v16i8(<1 x i8> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v8i16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x i16> %arg, i32 0 + %vec = insertelement poison, i16 %elt, i32 0 + call void @use_v8i16(<1 x i16> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v4i32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x i32> %arg, i32 0 + %vec = insertelement poison, i32 %elt, i32 0 + call void @use_v4i32(<1 x i32> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v2i64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x i64> %arg, i32 0 + %vec = insertelement poison, i64 %elt, i32 0 + call void @use_v2i64(<1 x i64> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1f16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 +; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload +; CHECK-NEXT: bl use_v8f16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x half> %arg, i32 0 + %vec = insertelement poison, half %elt, i32 0 + call void @use_v8f16(<1 x half> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1f32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v4f32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x float> %arg, i32 0 + %vec = insertelement poison, float %elt, i32 0 + call void @use_v4f32(<1 x float> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v2f64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x double> %arg, i32 0 + %vec = insertelement poison, double %elt, i32 0 + call void @use_v2f64(<1 x double> %arg) + store %vec, ptr %ptr + ret void +} + +; +; Full vector arguments +; + +define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v16i8 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv16i8.v16i8( poison, <16 x i8> %arg, i64 0) + call void @use_v16i8(<16 x i8> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v8i16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv8i16.v8i16( poison, <8 x i16> %arg, i64 0) + call void @use_v8i16(<8 x i16> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v4i32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv4i32.v4i32( poison, <4 x i32> %arg, i64 0) + call void @use_v4i32(<4 x i32> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v2i64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv2i64.v2i64( poison, <2 x i64> %arg, i64 0) + call void @use_v2i64(<2 x i64> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v8f16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> %arg, i64 0) + call void @use_v8f16(<8 x half> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v8bf16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv8bf16.v8bf16( poison, <8 x bfloat> %arg, i64 0) + call void @use_v8bf16(<8 x bfloat> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v4f32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv4f32.v4f32( poison, <4 x float> %arg, i64 0) + call void @use_v4f32(<4 x float> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v2f64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv2f64.v2f64( poison, <2 x double> %arg, i64 0) + call void @use_v2f64(<2 x double> %arg) + store %vec, ptr %ptr + ret void +} + +; +; <8 x i1> type will need type promotion. +; +define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: and z1.b, z1.b, #0x1 +; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0 +; CHECK-NEXT: str p0, [x8, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v8i1 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr p0, [x8, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: str p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv8i1.v8i1( poison, <8 x i1> %arg, i64 0) + call void @use_v8i1(<8 x i1> %arg) + store %vec, ptr %ptr + ret void +} + +; +; Scalar return values +; + +define void @dont_coalesce_res_i8(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_i8 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i8 @get_i8() + %vec = insertelement poison, i8 %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_i16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_i16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i16 @get_i16() + %vec = insertelement poison, i16 %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_i32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_i32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i32 @get_i32() + %vec = insertelement poison, i32 %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_i64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_i64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i64 @get_i64() + %vec = insertelement poison, i64 %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_f16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_f16 +; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, 
#80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call half @get_f16() + %vec = insertelement poison, half %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_f32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_f32 +; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call float @get_f32() + %vec = insertelement poison, float %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_f64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl 
get_f64 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call double @get_f64() + %vec = insertelement poison, double %res, i32 0 + store %vec, ptr %ptr + ret void +} + +; +; Single-element vector result values +; + +define void @dont_coalesce_res_v1i8(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1i8 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x i8> @get_v1i8() + %elt = 
extractelement <1 x i8> %res, i32 0 + %vec = insertelement poison, i8 %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1i16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1i16 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x i16> @get_v1i16() + %elt = extractelement <1 x i16> %res, i32 0 + %vec = insertelement poison, i16 %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1i32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1i32 +; 
CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x i32> @get_v1i32() + %elt = extractelement <1 x i32> %res, i32 0 + %vec = insertelement poison, i32 %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1i64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1i64 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x i64> @get_v1i64() + %elt = 
extractelement <1 x i64> %res, i32 0 + %vec = insertelement poison, i64 %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1f16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1f16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1f16 +; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x half> @get_v1f16() + %elt = extractelement <1 x half> %res, i32 0 + %vec = insertelement poison, half %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1f32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1f32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1f32 +; 
CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x float> @get_v1f32() + %elt = extractelement <1 x float> %res, i32 0 + %vec = insertelement poison, float %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1f64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1f64 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x double> @get_v1f64() + 
%elt = extractelement <1 x double> %res, i32 0 + %vec = insertelement poison, double %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +; +; Full vector result values +; + +define void @dont_coalesce_res_v16i8(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v16i8 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <16 x i8> @get_v16i8() + %vec = call @llvm.vector.insert.nxv16i8.v16i8( poison, <16 x i8> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v8i16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; 
CHECK-NEXT: bl get_v8i16 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <8 x i16> @get_v8i16() + %vec = call @llvm.vector.insert.nxv8i16.v8i16( poison, <8 x i16> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v4i32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v4i32 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <4 x i32> @get_v4i32() + %vec = 
call @llvm.vector.insert.nxv4i32.v4i32( poison, <4 x i32> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v2i64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v2i64 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <2 x i64> @get_v2i64() + %vec = call @llvm.vector.insert.nxv2i64.v2i64( poison, <2 x i64> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v8f16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v8f16 +; CHECK-NEXT: str q0, [sp] // 16-byte 
Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <8 x half> @get_v8f16() + %vec = call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v4f32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v4f32 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <4 x float> @get_v4f32() + %vec = call @llvm.vector.insert.nxv4f32.v4f32( poison, <4 x float> 
%res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v2f64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v2f64 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <2 x double> @get_v2f64() + %vec = call @llvm.vector.insert.nxv2f64.v2f64( poison, <2 x double> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +declare half @get_f16() +declare float @get_f32() +declare double @get_f64() +declare <1 x half> @get_v1f16() +declare <1 x float> @get_v1f32() +declare <1 x double> @get_v1f64() +declare <8 x half> @get_v8f16() +declare <4 x float> @get_v4f32() +declare <2 x double> @get_v2f64() + +declare i8 @get_i8() +declare i16 @get_i16() +declare i32 @get_i32() +declare i64 @get_i64() +declare <1 x i8> @get_v1i8() +declare <1 x i16> @get_v1i16() +declare <1 x i32> @get_v1i32() +declare <2 x i64> @get_v1i64() +declare <16 x i8> @get_v16i8() +declare <8 x i16> @get_v8i16() +declare <4 x i32> @get_v4i32() +declare <2 x i64> @get_v2i64() 
+ +declare void @use_f16(half) +declare void @use_f32(float) +declare void @use_f64(double) +declare void @use_v1f16(<1 x half>) +declare void @use_v1f32(<1 x float>) +declare void @use_v1f64(<1 x double>) +declare void @use_v8f16(<8 x half>) +declare void @use_v8bf16(<8 x bfloat>) +declare void @use_v4f32(<4 x float>) +declare void @use_v2f64(<2 x double>) + +declare void @use_i8(i8) +declare void @use_i16(i16) +declare void @use_i32(i32) +declare void @use_i64(i64) +declare void @use_v1i8(<1 x i8>) +declare void @use_v1i16(<1 x i16>) +declare void @use_v1i32(<1 x i32>) +declare void @use_v1i64(<1 x i64>) +declare void @use_v16i8(<16 x i8>) +declare void @use_v8i16(<8 x i16>) +declare void @use_v4i32(<4 x i32>) +declare void @use_v2i64(<2 x i64>) +declare void @use_v8i1(<8 x i1>) + +declare @llvm.vector.insert.nxv8i1.v8i1(, <8 x i1>, i64) +declare @llvm.vector.insert.nxv16i8.v16i8(, <16 x i8>, i64) +declare @llvm.vector.insert.nxv8i16.v8i16(, <8 x i16>, i64) +declare @llvm.vector.insert.nxv4i32.v4i32(, <4 x i32>, i64) +declare @llvm.vector.insert.nxv2i64.v2i64(, <2 x i64>, i64) +declare @llvm.vector.insert.nxv8f16.v8f16(, <8 x half>, i64) +declare @llvm.vector.insert.nxv4f32.v4f32(, <4 x float>, i64) +declare @llvm.vector.insert.nxv2f64.v2f64(, <2 x double>, i64) + +attributes #0 = { nounwind "aarch64_pstate_sm_enabled" "target-features"="+sve,+sme" } diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll index 55dc28f49bf157..93875549cffc86 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll @@ -249,11 +249,15 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta ; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; 
CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl cos ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index 5d0c9127d3ebb2..296f2be9cfee5e 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -129,30 +129,37 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) " ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: tbz w19, #0, .LBB4_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB4_2: -; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl normal_callee_vec_arg -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: str z0, [sp] // 16-byte Folded 
Spill +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: tbz w19, #0, .LBB4_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB4_4: ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z1, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z1, [x8] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 -; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload @@ -462,7 +469,11 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr ; CHECK-NEXT: mov x9, x0 ; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: ldp s4, s0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: stp s4, s0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: ldp d4, d0, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: stp d4, d0, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: tbz w19, #0, .LBB10_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: smstop sm diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll index dd7d6470ad7b08..86918a59f3810e 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll @@ -313,9 +313,9 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: str x30, [sp, #80] // 
8-byte Folded Spill -; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: stp d0, d0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ldr d0, [sp] // 8-byte Folded Reload ; CHECK-NEXT: bl cos ; CHECK-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm @@ -405,11 +405,11 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr ; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill -; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: stp s1, s0, [sp, #24] // 8-byte Folded Spill +; CHECK-NEXT: stp d3, d2, [sp, #8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload -; CHECK-NEXT: ldp d2, d3, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp s1, s0, [sp, #24] // 8-byte Folded Reload +; CHECK-NEXT: ldp d3, d2, [sp, #8] // 16-byte Folded Reload ; CHECK-NEXT: bl bar ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll index 45ca7844b06551..cf171f8ef5ed3a 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll @@ -22,9 +22,9 @@ define void @test_no_stackslot_scavenging(float %f) #0 { ; CHECK-NEXT: stp x30, x24, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP +; 
CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: bl use_f diff --git a/llvm/test/CodeGen/AArch64/vararg-tallcall.ll b/llvm/test/CodeGen/AArch64/vararg-tallcall.ll index 2d6db1642247d7..812837639196e6 100644 --- a/llvm/test/CodeGen/AArch64/vararg-tallcall.ll +++ b/llvm/test/CodeGen/AArch64/vararg-tallcall.ll @@ -1,5 +1,6 @@ ; RUN: llc -mtriple=aarch64-windows-msvc %s -o - | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s +; RUN: llc -mtriple=arm64ec-windows-msvc %s -o - | FileCheck %s --check-prefixes=CHECK-EC ; RUN: llc -global-isel -global-isel-abort=2 -verify-machineinstrs -mtriple=aarch64-windows-msvc %s -o - | FileCheck %s ; RUN: llc -global-isel -global-isel-abort=2 -verify-machineinstrs -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s @@ -32,3 +33,10 @@ attributes #1 = { noinline optnone "thunk" } ; CHECK: ldr x9, [x9] ; CHECK: mov v0.16b, v16.16b ; CHECK: br x9 +; CHECK-EC: mov v7.16b, v0.16b +; CHECK-EC: ldr x9, [x0] +; CHECK-EC: ldr x11, [x9] +; CHECK-EC: mov v0.16b, v7.16b +; CHECK-EC: add x4, sp, #64 +; CHECK-EC: add sp, sp, #64 +; CHECK-EC: br x11 diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll index 1b22e2f900ddb7..557aa010b3a7d9 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll @@ -489,3 +489,31 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) { %bitmask = bitcast <6 x i1> %cmp_result to i6 ret i6 %bitmask } + +; Only apply the combine when casting a vector to a scalar. 
+define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind { +; CHECK-LABEL: vector_to_vector_cast: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: shl.16b v0, v0, #7 +; CHECK-NEXT: Lloh36: +; CHECK-NEXT: adrp x8, lCPI20_0@PAGE +; CHECK-NEXT: Lloh37: +; CHECK-NEXT: ldr q1, [x8, lCPI20_0@PAGEOFF] +; CHECK-NEXT: add x8, sp, #14 +; CHECK-NEXT: cmlt.16b v0, v0, #0 +; CHECK-NEXT: and.16b v0, v0, v1 +; CHECK-NEXT: ext.16b v1, v0, v0, #8 +; CHECK-NEXT: zip1.16b v0, v0, v1 +; CHECK-NEXT: addv.8h h0, v0 +; CHECK-NEXT: str h0, [sp, #14] +; CHECK-NEXT: ld1.b { v0 }[0], [x8] +; CHECK-NEXT: orr x8, x8, #0x1 +; CHECK-NEXT: ld1.b { v0 }[4], [x8] +; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh36, Lloh37 + %bc = bitcast <16 x i1> %arg to <2 x i8> + ret <2 x i8> %bc +} diff --git a/llvm/test/CodeGen/AArch64/win64-fpowi.ll b/llvm/test/CodeGen/AArch64/win64-fpowi.ll new file mode 100644 index 00000000000000..3eb74f8394ec4e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/win64-fpowi.ll @@ -0,0 +1,136 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=aarch64-pc-windows-msvc19 -verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -mtriple=aarch64-pc-windows-msvc19 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - | FileCheck %s + +define double @powi_f64(double %a, i32 %b) { +; CHECK-LABEL: powi_f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: scvtf d1, w0 +; CHECK-NEXT: b pow +entry: + %c = call double @llvm.powi.f64.i32(double %a, i32 %b) + ret double %c +} + +define float @powi_f32(float %a, i32 %b) { +; CHECK-LABEL: powi_f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: scvtf s1, w0 +; CHECK-NEXT: b powf +entry: + %c = call float @llvm.powi.f32.i32(float %a, i32 %b) + ret float %c +} + +define half @powi_f16(half %a, i32 %b) { +; CHECK-LABEL: powi_f16: +; CHECK: .seh_proc powi_f16 +; 
CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: scvtf s1, w0 +; CHECK-NEXT: bl powf +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: ret +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc +entry: + %c = call half @llvm.powi.f16.i32(half %a, i32 %b) + ret half %c +} + +define <2 x double> @powi_v2f64(<2 x double> %a, i32 %b) { +; CHECK-LABEL: powi_v2f64: +; CHECK: .seh_proc powi_v2f64 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .seh_stackalloc 48 +; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-NEXT: .seh_save_reg x30, 32 +; CHECK-NEXT: str d8, [sp, #40] // 8-byte Folded Spill +; CHECK-NEXT: .seh_save_freg d8, 40 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: scvtf d8, w0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: mov d0, v0.d[1] +; CHECK-NEXT: fmov d1, d8 +; CHECK-NEXT: bl pow +; CHECK-NEXT: fmov d1, d8 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: bl pow +; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldr d8, [sp, #40] // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_freg d8, 40 +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg x30, 32 +; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: .seh_stackalloc 48 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: ret +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc 
+entry: + %c = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> %a, i32 %b) + ret <2 x double> %c +} + +define <2 x float> @powi_v2f32(<2 x float> %a, i32 %b) { +; CHECK-LABEL: powi_v2f32: +; CHECK: .seh_proc powi_v2f32 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .seh_stackalloc 48 +; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-NEXT: .seh_save_reg x30, 32 +; CHECK-NEXT: str d8, [sp, #40] // 8-byte Folded Spill +; CHECK-NEXT: .seh_save_freg d8, 40 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: scvtf s8, w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: mov s0, v0.s[1] +; CHECK-NEXT: fmov s1, s8 +; CHECK-NEXT: bl powf +; CHECK-NEXT: fmov s1, s8 +; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: bl powf +; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: mov v0.s[1], v1.s[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldr d8, [sp, #40] // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_freg d8, 40 +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg x30, 32 +; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: .seh_stackalloc 48 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: ret +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc +entry: + %c = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> %a, i32 %b) + ret <2 x float> %c +} + +declare <2 x double> @llvm.powi.v2f64.i32(<2 x double>, i32) +declare <2 x float> @llvm.powi.v2f32.i32(<2 x float>, i32) +declare double @llvm.powi.f64.i32(double, i32) +declare float @llvm.powi.f32.i32(float, i32) +declare half @llvm.powi.f16.i32(half, i32) diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll new file mode 100644 index 00000000000000..ea377531df2aea --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll @@ -0,0 +1,504 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: 
v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0) + 
store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x half> %C + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) + %res = call <8 x half> 
@llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GFX12: ; %bb.0: ; 
%bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res 
= call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; 
GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <16 x half> %B + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <16 x half> %B + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +; both neg and abs patterns (wmma matrix C f32 or f16 ) + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; 
GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %fneg.fabs.C = fneg <8 x float> %fabs.C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) + %fneg.fabs.C = fneg <8 x half> %fabs.C + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 
v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %el3 = extractelement <8 x float> %C, i32 3 + %el3.fabs = call float @llvm.fabs.f32(float %el3) + %partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3 + %fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +; A or B matrix modifier and constant in C + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> 
%B + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> , i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +; pack f16 elements with v_perm_b32 since they don't come from same b32 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: flat_load_b128 v[12:15], v[8:9] +; GFX12-NEXT: flat_load_b128 v[16:19], v[8:9] offset:16 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x101 +; GFX12-NEXT: v_and_b32_e32 v8, 0xffff, v12 +; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v14 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX12-NEXT: v_and_b32_e32 v16, 0xffff, v18 +; GFX12-NEXT: v_lshl_or_b32 v12, v13, 16, v8 +; GFX12-NEXT: v_lshl_or_b32 v13, v15, 16, v9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_lshl_or_b32 v14, v17, 16, v14 +; GFX12-NEXT: v_lshl_or_b32 v15, v19, 16, v16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[10:11], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %C = load <16 x half>, ptr %Caddr + %C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> + %fneg.C_shuffle = fneg <8 x half> %C_shuffle + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +declare <8 x half> @llvm.fabs.v8f16(<8 x half>) +declare <8 x float> @llvm.fabs.v8f32(<8 x float>) +declare float @llvm.fabs.f32(float) + +declare <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll new file mode 100644 index 00000000000000..6251dfdc392ebc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll @@ -0,0 +1,519 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6 +; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4 +; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6 +; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4 +; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x42004200 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: 
global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> 
%B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> 
@llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6 +; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 +; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2 +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], 
v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: 
s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: +; GFX12: 
; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 
x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + 
ret void +} + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 
x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll new file mode 100644 index 00000000000000..fe6d16bd8b5ead --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll @@ -0,0 +1,309 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> 
@llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: 
global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; 
GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store 
<8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], 
v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; 
GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll new file mode 100644 index 00000000000000..c80d7a6d9a836e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll @@ -0,0 +1,321 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v20, v[20:21], off +; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> 
%C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v20, v[20:21], off +; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> 
@llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v16, v[16:17], off +; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index0) + store <8 x half> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index1) + store <8 x half> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v16, 
v[16:17], off +; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index0) + store <8 x i16> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index1) + store <8 x i16> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 
index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0) + store <8 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0) + store <8 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v11, v[11:12], off +; GFX12-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9 +; GFX12-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7 +; GFX12-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5 +; GFX12-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[13:14], v[17:20], off +; GFX12-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16 +; GFX12-NEXT: 
s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0) + store <8 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0) + store <8 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; 
GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = 
extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, 
<8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> 
@llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll new file mode 100644 index 00000000000000..c4edc5b72b2fbb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll @@ -0,0 +1,370 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s 
--check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, 
i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 
%B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 +; GFX12-NEXT: 
s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) { +; 
GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + 
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x 
i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare 
<8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll new file mode 100644 index 00000000000000..eafbfb6d1eeb53 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll @@ -0,0 +1,459 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C) + store 
<4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x half> 
@llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x half> %C + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + 
%fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0] +; 
GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> 
addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +; both neg and abs patterns (wmma matrix C f32 or f16 ) + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %fneg.fabs.C = fneg <4 x float> %fabs.C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) + %fneg.fabs.C = fneg <4 x 
half> %fabs.C + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %el3 = extractelement <4 x float> %C, i32 3 + %el3.fabs = call float @llvm.fabs.f32(float %el3) + %partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3 + %fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +; A or B matrix modifier and constant in C + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define 
amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> , i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +; pack f16 elements with v_perm_b32 since they don't come from same b32 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_lshl_or_b32 v4, v9, 16, v4 +; GFX12-NEXT: v_lshl_or_b32 v5, v11, 16, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %C = load <8 x half>, ptr %Caddr + %C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> + %fneg.C_shuffle = fneg <4 x half> %C_shuffle + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle , i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +declare 
<4 x half> @llvm.fabs.v4f16(<4 x half>) +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) +declare float @llvm.fabs.f32(float) + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half>, <8 x half>, <4 x float>, i16) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half>, <8 x half>, <4 x half>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll new file mode 100644 index 00000000000000..c4d70fd5f0637f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll @@ -0,0 +1,430 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + 
%res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v9, s3 +; GFX12-NEXT: v_mov_b32_e32 v8, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s1 +; GFX12-NEXT: v_mov_b32_e32 v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; 
%bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v9, s3 +; GFX12-NEXT: v_mov_b32_e32 v8, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s1 +; GFX12-NEXT: v_mov_b32_e32 v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x42004200 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s1 +; GFX12-NEXT: v_mov_b32_e32 v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7] +; GFX12-NEXT: 
global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s1 +; GFX12-NEXT: v_mov_b32_e32 v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s1 +; GFX12-NEXT: v_mov_b32_e32 v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> 
@llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; 
GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; 
GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: 
v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) 
+ store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll new file mode 100644 index 00000000000000..7e1d09805df3f6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll @@ -0,0 +1,274 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], 
v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> 
@llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret 
void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 
%B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, 
i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll new file mode 100644 index 00000000000000..b6f1828dce2576 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll @@ -0,0 +1,472 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void 
@test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v10, v[10:11], off +; GFX12-NEXT: v_mov_b32_e32 v23, v9 +; GFX12-NEXT: v_mov_b32_e32 v22, v8 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v9 +; GFX12-NEXT: v_mov_b32_e32 v26, v8 +; GFX12-NEXT: v_mov_b32_e32 v25, v7 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v31, v9 +; GFX12-NEXT: v_mov_b32_e32 v30, v8 +; GFX12-NEXT: v_mov_b32_e32 v29, v7 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, 
<8 x half> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v10, v[10:11], off +; GFX12-NEXT: v_mov_b32_e32 v23, v9 +; GFX12-NEXT: v_mov_b32_e32 v22, v8 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v9 +; GFX12-NEXT: v_mov_b32_e32 v26, v8 +; GFX12-NEXT: v_mov_b32_e32 v25, v7 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v31, v9 +; GFX12-NEXT: v_mov_b32_e32 v30, v8 +; GFX12-NEXT: v_mov_b32_e32 v29, v7 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off 
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v22, v[8:9], off +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v7 +; GFX12-NEXT: v_mov_b32_e32 v18, v6 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], 
v[2:5], v22 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index0) + store <4 x half> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index1) + store <4 x half> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index2) + store <4 x half> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index3) + store <4 x half> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v22, 
v[8:9], off +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v7 +; GFX12-NEXT: v_mov_b32_e32 v18, v6 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index0) + store <4 x i16> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index1) + store <4 x i16> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index2) + store <4 x i16> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index3) + store <4 x i16> %res3, ptr 
addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index2, i1 0) + store <4 x i32> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index3, i1 0) + store <4 x i32> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v6, v[6:7], off +; GFX12-NEXT: v_mov_b32_e32 v15, v5 +; GFX12-NEXT: v_mov_b32_e32 v14, v4 +; GFX12-NEXT: v_mov_b32_e32 v13, v3 +; GFX12-NEXT: v_mov_b32_e32 v12, v2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 +; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <4 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v16, v6 +; GFX12-NEXT: v_mov_b32_e32 v15, v5 +; GFX12-NEXT: v_mov_b32_e32 v14, v4 +; GFX12-NEXT: v_mov_b32_e32 v13, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off +; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb 
+; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, 
ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> 
@llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: 
v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: 
v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> 
@llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8) +declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll new file mode 100644 index 00000000000000..0d1871a18d4055 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll @@ -0,0 +1,333 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel 
-march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, 
v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr 
addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 
%A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: 
test_swmmac_f32_16x16x32_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) 
+declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8) +declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) 
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 4aa97c57cbd9c2..5296ad3ab51d31 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -1,9 +1,5446 @@ -; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s -; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-SDAG-O0 %s + +; FIXME: GlobalISel missing the power-of-2 cases in legalization. 
https://github.com/llvm/llvm-project/issues/80671 +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9 %s +; xUN: llc -O0 -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-O0 %s -; SDAG-ERR: LLVM ERROR: unsupported libcall legalization -; GISEL-ERR: LLVM ERROR: unable to legalize instruction: %{{[0-9]+}}:_(s128) = G_SDIV %{{[0-9]+}}:_, %{{[0-9]+}}:_ (in function: v_sdiv_i128_vv) define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { - %shl = sdiv i128 %lhs, %rhs - ret i128 %shl +; GFX9-LABEL: v_sdiv_i128_vv: +; GFX9: ; %bb.0: ; %_udiv-special-cases +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v16, 31, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, v16, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, v16, v1 +; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v16 +; GFX9-NEXT: v_xor_b32_e32 v2, v16, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v16, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v17, 31, v7 +; GFX9-NEXT: v_xor_b32_e32 v3, v16, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v2, v16, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v3, v16, vcc +; GFX9-NEXT: v_xor_b32_e32 v3, v17, v4 +; GFX9-NEXT: v_xor_b32_e32 v2, v17, v5 +; GFX9-NEXT: v_sub_co_u32_e32 v20, vcc, v3, v17 +; GFX9-NEXT: v_xor_b32_e32 v0, v17, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v21, vcc, v2, v17, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, v17, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v17, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v17, vcc +; GFX9-NEXT: v_or_b32_e32 v3, v21, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v20, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v3, v9, v11 +; GFX9-NEXT: v_or_b32_e32 v2, v8, v10 +; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] +; GFX9-NEXT: v_ffbh_u32_e32 v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 32, v2 +; GFX9-NEXT: v_ffbh_u32_e32 v3, v1 +; GFX9-NEXT: v_min_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_ffbh_u32_e32 v3, v20 +; 
GFX9-NEXT: v_add_u32_e32 v3, 32, v3 +; GFX9-NEXT: v_ffbh_u32_e32 v4, v21 +; GFX9-NEXT: v_min_u32_e32 v3, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 64, v3 +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX9-NEXT: v_ffbh_u32_e32 v5, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_ffbh_u32_e32 v3, v10 +; GFX9-NEXT: v_add_u32_e32 v3, 32, v3 +; GFX9-NEXT: v_min_u32_e32 v3, v3, v5 +; GFX9-NEXT: v_ffbh_u32_e32 v5, v8 +; GFX9-NEXT: v_add_u32_e32 v5, 32, v5 +; GFX9-NEXT: v_ffbh_u32_e32 v6, v9 +; GFX9-NEXT: v_min_u32_e32 v5, v5, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 64, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v6, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v5, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v18, v16 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX9-NEXT: v_xor_b32_e32 v6, 0x7f, v2 +; GFX9-NEXT: v_or_b32_e32 v7, v3, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v4 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GFX9-NEXT: v_cndmask_b32_e64 v13, v11, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v12, 
v10, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, 0, s[4:5] +; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB0_6 +; GFX9-NEXT: ; %bb.1: ; %udiv-bb1 +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v4, vcc +; GFX9-NEXT: v_sub_u32_e32 v7, 0x7f, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v5, vcc +; GFX9-NEXT: v_sub_u32_e32 v12, 64, v7 +; GFX9-NEXT: v_or_b32_e32 v4, v23, v25 +; GFX9-NEXT: v_or_b32_e32 v3, v22, v24 +; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, v[10:11] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], v12, v[8:9] +; GFX9-NEXT: v_sub_u32_e32 v2, 63, v2 +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[3:4] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[8:9] +; GFX9-NEXT: v_or_b32_e32 v4, v6, v13 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v12 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v7, v[8:9] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB0_5 +; GFX9-NEXT: ; %bb.2: ; %udiv-preheader +; GFX9-NEXT: v_sub_u32_e32 v12, 64, v22 +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v22, v[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[12:13], v12, v[10:11] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 +; GFX9-NEXT: v_or_b32_e32 v12, v6, v12 +; GFX9-NEXT: v_subrev_u32_e32 v6, 64, v22 +; GFX9-NEXT: 
v_or_b32_e32 v13, v7, v13 +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v6, v[10:11] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v12, v6, v12, vcc +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v22, v[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v12, v8, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v6, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, -1, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v21, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, -1, v0, vcc +; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, -1, v1, vcc +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: .LBB0_3: ; %udiv-do-while +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 31, v5 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] +; GFX9-NEXT: v_or_b32_e32 v4, v14, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 31, v9 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-NEXT: v_or_b32_e32 v10, v10, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 31, v3 +; GFX9-NEXT: v_or_b32_e32 v8, v8, v14 +; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v26, v8 +; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v27, v9, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v28, v10, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v29, v11, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v30, 31, v14 +; GFX9-NEXT: v_and_b32_e32 v14, v30, v20 +; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v8, v14 +; GFX9-NEXT: v_and_b32_e32 v14, v30, v21 +; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v9, v14, vcc +; GFX9-NEXT: v_and_b32_e32 v14, v30, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v14, vcc +; GFX9-NEXT: v_and_b32_e32 v14, v30, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 
v11, v14, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, -1, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v24, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc +; GFX9-NEXT: v_or_b32_e32 v5, v15, v5 +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v14, v22, v24 +; GFX9-NEXT: v_or_b32_e32 v15, v23, v25 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GFX9-NEXT: v_or3_b32 v2, v2, v6, v12 +; GFX9-NEXT: v_and_b32_e32 v6, 1, v30 +; GFX9-NEXT: v_mov_b32_e32 v15, v7 +; GFX9-NEXT: v_or3_b32 v3, v3, 0, v13 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB0_3 +; GFX9-NEXT: ; %bb.4: ; %Flow +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB0_5: ; %Flow2 +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5 +; GFX9-NEXT: v_or3_b32 v13, v3, 0, v13 +; GFX9-NEXT: v_or3_b32 v12, v2, v4, v12 +; GFX9-NEXT: v_or_b32_e32 v7, v7, v1 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v0 +; GFX9-NEXT: .LBB0_6: ; %Flow3 +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_xor_b32_e32 v2, v17, v16 +; GFX9-NEXT: v_xor_b32_e32 v3, v19, v18 +; GFX9-NEXT: v_xor_b32_e32 v0, v6, v2 +; GFX9-NEXT: v_xor_b32_e32 v1, v7, v3 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_xor_b32_e32 v5, v12, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_xor_b32_e32 v4, v13, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v2, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-O0-LABEL: v_sdiv_i128_vv: +; GFX9-O0: ; %bb.0: ; %_udiv-special-cases +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; 
implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 +; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-O0-NEXT: v_ashrrev_i64 v[11:12], s4, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14 +; GFX9-O0-NEXT: v_xor_b32_e64 v1, v6, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v13 +; GFX9-O0-NEXT: v_xor_b32_e64 v13, v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10 +; GFX9-O0-NEXT: v_xor_b32_e64 v1, v6, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_xor_b32_e64 v15, v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v4 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v4, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr9 
killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_xor_b32_e64 v1, v5, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_xor_b32_e64 v11, v3, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v8 +; GFX9-O0-NEXT: v_xor_b32_e64 v1, v5, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-O0-NEXT: v_xor_b32_e64 v7, v3, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v3 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v8, v3, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], 
s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 1 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[1:2], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v15, v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v3, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, 
v15 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], s[6:7] +; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-O0-NEXT: s_mov_b32 s9, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v6, v7 +; GFX9-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 +; GFX9-O0-NEXT: s_mov_b32 s12, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 +; GFX9-O0-NEXT: s_mov_b32 s14, s11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[12:13], v8, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[12:13] +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: 
v_ffbh_u32_e64 v6, v2 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v5, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v4 +; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 +; GFX9-O0-NEXT: s_mov_b32 s8, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 +; GFX9-O0-NEXT: s_mov_b32 s10, s11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v11, s[8:9], v11, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[8:9], v5, v12, s[8:9] +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: s_mov_b32 s10, s6 +; GFX9-O0-NEXT: s_mov_b32 s11, s7 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v6, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s10 +; 
GFX9-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v6, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v6, v7, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[14:15] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[8:9] +; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v7, 1 +; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; 
GFX9-O0-NEXT: s_mov_b32 s14, s13 +; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s14 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 +; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v7, v7, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[5:6], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v2, v5, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 3 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-O0-NEXT: s_branch .LBB0_8 +; GFX9-O0-NEXT: .LBB0_1: ; %Flow +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: ; %bb.2: ; %Flow +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 
; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB0_5 +; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 2 +; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 3 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB0_9 +; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s4, v[9:10] +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-O0-NEXT: v_or3_b32 v4, v4, v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 +; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB0_3 +; GFX9-O0-NEXT: .LBB0_5: ; %Flow1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB0_4 +; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while +; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 +; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v30 +; GFX9-O0-NEXT: s_mov_b32 s5, 1 +; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], s5, v[23:24] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: v_or_b32_e64 v23, v5, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[2:3] +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v29 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] +; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[6:7] +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-O0-NEXT: 
v_mov_b32_e32 v6, v30 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 +; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v25 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], 1 +; GFX9-O0-NEXT: s_mov_b32 s8, s5 +; GFX9-O0-NEXT: v_and_b32_e64 v12, v7, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13 +; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_and_b32_e64 v14, v11, s4 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; 
GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v23, v7, v23 +; GFX9-O0-NEXT: v_and_b32_e64 v21, v11, v21 +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v20 +; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v23 +; GFX9-O0-NEXT: v_and_b32_e64 v23, v11, v19 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v22 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v20 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v19, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_mov_b32 s5, s8 +; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v20, vcc, v11, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v10, v11, vcc +; 
GFX9-O0-NEXT: v_mov_b32_e32 v10, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v21 +; GFX9-O0-NEXT: v_or_b32_e64 v19, v19, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v17, v17, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v19 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[12:13] +; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 
offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 8 +; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: 
buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 +; GFX9-O0-NEXT: s_branch .LBB0_1 +; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:36 ; 
4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(9) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[21:22] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: s_mov_b32 s6, 64 +; GFX9-O0-NEXT: v_sub_u32_e64 v12, s6, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], v12, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v24 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v4, s6 +; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s6 +; GFX9-O0-NEXT: v_lshrrev_b64 v[23:24], v5, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[4:5] +; GFX9-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v22 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v23 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; 
GFX9-O0-NEXT: s_mov_b32 s8, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v12, v12, v15, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: s_mov_b32 s8, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_mov_b32 s5, s8 +; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v12, vcc, v12, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v15, v17, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v15, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v15, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v13 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 
offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 8 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte 
Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB0_6 +; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 +; GFX9-O0-NEXT: s_mov_b32 s5, s6 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O0-NEXT: s_mov_b32 s4, s7 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b32 s8, s6 +; GFX9-O0-NEXT: s_mov_b32 s9, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v9, vcc, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; 
GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f +; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-O0-NEXT: s_mov_b32 s4, 64 +; GFX9-O0-NEXT: v_sub_u32_e64 v14, s4, v3 +; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v14, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v16 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v3, s4 +; GFX9-O0-NEXT: s_mov_b32 s10, 63 +; GFX9-O0-NEXT: v_sub_u32_e64 v4, s10, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v4, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[4:5] +; GFX9-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v3, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, 
v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[10:11] +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[7:8], v3, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v4, v7, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[1:2], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 +; GFX9-O0-NEXT: s_branch .LBB0_7 +; GFX9-O0-NEXT: .LBB0_9: ; %udiv-end +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte 
Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: v_xor_b32_e64 v9, v6, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_xor_b32_e64 v0, v0, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v7, vcc, v7, v8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v5, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-O0-NEXT: 
v_mov_b32_e32 v0, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: v_lshrrev_b64 v[5:6], s4, v[5:6] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_setpc_b64 s[30:31] + %div = sdiv i128 %lhs, %rhs + ret i128 %div +} + +define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { +; GFX9-LABEL: v_udiv_i128_vv: +; GFX9: ; %bb.0: ; %_udiv-special-cases +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v9, v5, v7 +; GFX9-NEXT: v_or_b32_e32 v8, v4, v6 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX9-NEXT: v_or_b32_e32 v9, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v8, v0, v2 +; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9] +; GFX9-NEXT: v_ffbh_u32_e32 v8, v6 +; GFX9-NEXT: v_add_u32_e32 v8, 32, v8 +; GFX9-NEXT: v_ffbh_u32_e32 v9, v7 +; GFX9-NEXT: v_min_u32_e32 v8, v8, v9 +; GFX9-NEXT: v_ffbh_u32_e32 v9, v4 +; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX9-NEXT: v_ffbh_u32_e32 v10, v5 +; GFX9-NEXT: v_min_u32_e32 v9, v9, v10 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 64, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v10, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_ffbh_u32_e32 v11, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; GFX9-NEXT: v_ffbh_u32_e32 v9, v2 +; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX9-NEXT: 
v_min_u32_e32 v9, v9, v11 +; GFX9-NEXT: v_ffbh_u32_e32 v11, v0 +; GFX9-NEXT: v_add_u32_e32 v11, 32, v11 +; GFX9-NEXT: v_ffbh_u32_e32 v12, v1 +; GFX9-NEXT: v_min_u32_e32 v11, v11, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, 64, v11 +; GFX9-NEXT: v_addc_co_u32_e64 v12, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f +; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v13, v12, 0, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v12, vcc, v8, v9 +; GFX9-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v13, vcc +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_subbrev_co_u32_e32 v14, vcc, 0, v8, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v8, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[12:13] +; GFX9-NEXT: v_or_b32_e32 v10, v13, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX9-NEXT: v_xor_b32_e32 v9, 0x7f, v12 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX9-NEXT: v_or_b32_e32 v9, v9, v14 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10] +; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v1, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v0, 0, s[4:5] +; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB1_6 +; GFX9-NEXT: ; %bb.1: ; %udiv-bb1 +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 1, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v14, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v15, vcc +; GFX9-NEXT: v_sub_u32_e32 
v15, 0x7f, v12 +; GFX9-NEXT: v_or_b32_e32 v9, v19, v21 +; GFX9-NEXT: v_or_b32_e32 v8, v18, v20 +; GFX9-NEXT: v_sub_u32_e32 v13, 64, v15 +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v15, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], v13, v[0:1] +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GFX9-NEXT: v_sub_u32_e32 v8, 63, v12 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], v8, v[0:1] +; GFX9-NEXT: v_or_b32_e32 v11, v11, v14 +; GFX9-NEXT: v_or_b32_e32 v10, v10, v13 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v15, v[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v15 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v11, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5] +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-NEXT: ; %bb.2: ; %udiv-preheader +; GFX9-NEXT: v_sub_u32_e32 v14, 64, v18 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], v18, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[14:15], v14, v[2:3] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 +; GFX9-NEXT: v_or_b32_e32 v14, v12, v14 +; GFX9-NEXT: v_subrev_u32_e32 v12, 64, v18 +; GFX9-NEXT: v_or_b32_e32 v15, v13, v15 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], v12, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v18, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, -1, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 +; GFX9-NEXT: v_addc_co_u32_e32 
v24, vcc, -1, v6, vcc +; GFX9-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v7, vcc +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v17, 0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: .LBB1_3: ; %udiv-do-while +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_lshlrev_b64 v[26:27], 1, v[10:11] +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v11 +; GFX9-NEXT: v_or_b32_e32 v10, v16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 31, v1 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: v_or_b32_e32 v11, v17, v27 +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 31, v9 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v17 +; GFX9-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v22, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v23, v1, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v24, v2, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v25, v3, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v26, 31, v16 +; GFX9-NEXT: v_and_b32_e32 v16, v26, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v16 +; GFX9-NEXT: v_and_b32_e32 v16, v26, v5 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v16, vcc +; GFX9-NEXT: v_and_b32_e32 v16, v26, v6 +; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 +; GFX9-NEXT: v_and_b32_e32 v12, v26, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v16, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v12, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, -1, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, -1, v19, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, -1, v20, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc +; GFX9-NEXT: v_or_b32_e32 v16, v18, v20 +; GFX9-NEXT: v_or_b32_e32 v17, v19, v21 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GFX9-NEXT: 
v_and_b32_e32 v12, 1, v26 +; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v16, v12 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: ; %bb.4: ; %Flow +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB1_5: ; %Flow2 +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[10:11] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v11 +; GFX9-NEXT: v_or3_b32 v8, v3, 0, v15 +; GFX9-NEXT: v_or3_b32 v9, v2, v4, v14 +; GFX9-NEXT: v_or_b32_e32 v10, v13, v1 +; GFX9-NEXT: v_or_b32_e32 v11, v12, v0 +; GFX9-NEXT: .LBB1_6: ; %Flow3 +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v0, v11 +; GFX9-NEXT: v_mov_b32_e32 v1, v10 +; GFX9-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-O0-LABEL: v_udiv_i128_vv: +; GFX9-O0: ; %bb.0: ; %_udiv-special-cases +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; 
GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 1 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[1:2], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v15, v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v3, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v15 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], s[6:7] +; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-O0-NEXT: s_mov_b32 s9, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v6, v7 +; GFX9-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; 
kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 +; GFX9-O0-NEXT: s_mov_b32 s12, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 +; GFX9-O0-NEXT: s_mov_b32 s14, s11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[12:13], v8, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[12:13] +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v2 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v5, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v4 +; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 +; GFX9-O0-NEXT: s_mov_b32 s8, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 +; 
GFX9-O0-NEXT: s_mov_b32 s10, s11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v11, s[8:9], v11, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[8:9], v5, v12, s[8:9] +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: s_mov_b32 s10, s6 +; GFX9-O0-NEXT: s_mov_b32 s11, s7 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v6, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v6, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v6, v7, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: buffer_store_dword v10, off, 
s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[14:15] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[8:9] +; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v7, 1 +; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: s_mov_b32 s14, s13 +; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s14 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 +; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v7, v7, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed 
$exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[5:6], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v2, v5, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 3 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-O0-NEXT: s_branch .LBB1_8 +; GFX9-O0-NEXT: .LBB1_1: ; %Flow +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 
exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: ; %bb.2: ; %Flow +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB1_5 +; GFX9-O0-NEXT: .LBB1_3: ; %Flow2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 
; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 2 +; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 3 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB1_9 +; GFX9-O0-NEXT: .LBB1_4: ; %udiv-loop-exit +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] +; GFX9-O0-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s4, v[9:10] +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-O0-NEXT: v_or3_b32 v4, v4, v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 +; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB1_3 +; GFX9-O0-NEXT: .LBB1_5: ; %Flow1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; 
GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB1_4 +; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while +; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 +; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: s_waitcnt 
vmcnt(16) +; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v30 +; GFX9-O0-NEXT: s_mov_b32 s5, 1 +; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], s5, v[23:24] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: v_or_b32_e64 v23, v5, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[2:3] +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v29 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] +; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[6:7] +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v30 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 +; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v25 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], 1 +; GFX9-O0-NEXT: s_mov_b32 s8, s5 +; GFX9-O0-NEXT: v_and_b32_e64 v12, v7, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13 +; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_and_b32_e64 v14, v11, s4 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v23, v7, v23 +; GFX9-O0-NEXT: v_and_b32_e64 v21, v11, v21 +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v20 +; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v23 +; GFX9-O0-NEXT: v_and_b32_e64 v23, v11, v19 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v22 +; 
GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v20 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v19, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_mov_b32 s5, s8 +; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v20, vcc, v11, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v10, v11, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v21 +; GFX9-O0-NEXT: v_or_b32_e64 v19, v19, v22 +; 
GFX9-O0-NEXT: v_mov_b32_e32 v18, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v17, v17, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v19 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[12:13] +; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 8 +; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: 
buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execnz .LBB1_6 +; GFX9-O0-NEXT: s_branch .LBB1_1 +; GFX9-O0-NEXT: .LBB1_7: ; %udiv-preheader +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(9) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[21:22] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: s_mov_b32 s6, 64 +; GFX9-O0-NEXT: v_sub_u32_e64 v12, s6, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], v12, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v24 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed 
$exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v4, s6 +; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s6 +; GFX9-O0-NEXT: v_lshrrev_b64 v[23:24], v5, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[4:5] +; GFX9-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v22 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v23 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b32 s8, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v12, v12, v15, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: s_mov_b32 s8, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_mov_b32 s5, s8 +; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 
v17, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v12, vcc, v12, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v15, v17, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v15, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v15, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v13 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 8 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded 
Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB1_6 +; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 
offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 +; GFX9-O0-NEXT: s_mov_b32 s5, s6 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O0-NEXT: s_mov_b32 s4, s7 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b32 s8, s6 +; GFX9-O0-NEXT: s_mov_b32 s9, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v9, vcc, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f +; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-O0-NEXT: s_mov_b32 s4, 64 +; GFX9-O0-NEXT: v_sub_u32_e64 v14, s4, v3 +; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v14, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v16 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v3, s4 +; GFX9-O0-NEXT: s_mov_b32 s10, 63 +; GFX9-O0-NEXT: v_sub_u32_e64 v4, s10, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v4, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[4:5] +; GFX9-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v3, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[10:11] +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[7:8], v3, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; 
GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v4, v7, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[1:2], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 
4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-O0-NEXT: s_branch .LBB1_7 +; GFX9-O0-NEXT: .LBB1_9: ; %udiv-end +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[5:6] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 
4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_setpc_b64 s[30:31] + %div = udiv i128 %lhs, %rhs + ret i128 %div +} + +define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { +; GFX9-LABEL: v_srem_i128_vv: +; GFX9: ; %bb.0: ; %_udiv-special-cases +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v20, 31, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v20 +; GFX9-NEXT: v_xor_b32_e32 v10, v2, v20 +; GFX9-NEXT: v_xor_b32_e32 v1, v1, v20 +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v20 +; GFX9-NEXT: v_xor_b32_e32 v9, v3, v20 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v20, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v10, v20, vcc +; GFX9-NEXT: v_xor_b32_e32 v4, v4, v8 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v9, v20, vcc +; GFX9-NEXT: v_xor_b32_e32 v5, v5, v8 +; GFX9-NEXT: v_sub_co_u32_e32 v23, vcc, v4, v8 +; GFX9-NEXT: v_xor_b32_e32 v6, v6, v8 +; GFX9-NEXT: v_subb_co_u32_e32 v21, vcc, v5, v8, vcc +; GFX9-NEXT: v_xor_b32_e32 v7, v7, v8 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v6, v8, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v8, vcc +; GFX9-NEXT: v_or_b32_e32 v7, v21, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v23, v4 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_or_b32_e32 v7, v3, v1 +; GFX9-NEXT: v_or_b32_e32 v6, v2, v0 +; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[6:7] +; GFX9-NEXT: v_ffbh_u32_e32 v6, v4 +; GFX9-NEXT: v_add_u32_e32 v6, 32, v6 +; GFX9-NEXT: v_ffbh_u32_e32 v7, v5 +; GFX9-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX9-NEXT: v_ffbh_u32_e32 v7, v23 +; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 +; GFX9-NEXT: v_ffbh_u32_e32 v8, v21 +; GFX9-NEXT: v_min_u32_e32 v7, v7, v8 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 64, v7 +; GFX9-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: v_ffbh_u32_e32 v9, v1 +; GFX9-NEXT: v_cndmask_b32_e32 
v6, v7, v6, vcc +; GFX9-NEXT: v_ffbh_u32_e32 v7, v0 +; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 +; GFX9-NEXT: v_min_u32_e32 v7, v7, v9 +; GFX9-NEXT: v_ffbh_u32_e32 v9, v2 +; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX9-NEXT: v_ffbh_u32_e32 v10, v3 +; GFX9-NEXT: v_min_u32_e32 v9, v9, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 64, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v10, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v10, vcc +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_subbrev_co_u32_e32 v8, vcc, 0, v9, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7] +; GFX9-NEXT: v_or_b32_e32 v13, v7, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GFX9-NEXT: v_mov_b32_e32 v22, v20 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc +; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX9-NEXT: v_xor_b32_e32 v10, 0x7f, v6 +; GFX9-NEXT: v_or_b32_e32 v12, v10, v8 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] +; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v1, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v12, v0, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v3, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[4:5] +; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB2_6 +; GFX9-NEXT: ; %bb.1: ; %udiv-bb1 +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v7, vcc +; 
GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, 0, v8, vcc +; GFX9-NEXT: v_sub_u32_e32 v13, 0x7f, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v9, vcc +; GFX9-NEXT: v_sub_u32_e32 v11, 64, v13 +; GFX9-NEXT: v_or_b32_e32 v8, v25, v27 +; GFX9-NEXT: v_or_b32_e32 v7, v24, v26 +; GFX9-NEXT: v_lshlrev_b64 v[9:10], v13, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], v11, v[2:3] +; GFX9-NEXT: v_sub_u32_e32 v6, 63, v6 +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v8, v10, v12 +; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v13 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v13 +; GFX9-NEXT: v_lshlrev_b64 v[12:13], v13, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v1, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v13, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[4:5] +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB2_5 +; GFX9-NEXT: ; %bb.2: ; %udiv-preheader +; GFX9-NEXT: v_sub_u32_e32 v10, 64, v24 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[0:1] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 +; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v24 +; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v15, v9, v3, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v10, v8, v10, vcc +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v14, v10, v2, s[4:5] +; GFX9-NEXT: 
v_cndmask_b32_e32 v17, 0, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, -1, v23 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, -1, v21, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v30, vcc, -1, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v18, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, -1, v5, vcc +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v19, 0 +; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: .LBB2_3: ; %udiv-do-while +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 31, v15 +; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[14:15] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 31, v7 +; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 31, v13 +; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[16:17] +; GFX9-NEXT: v_or_b32_e32 v14, v14, v33 +; GFX9-NEXT: v_or3_b32 v6, v6, v8, v10 +; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v28, v14 +; GFX9-NEXT: v_or_b32_e32 v16, v16, v32 +; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v29, v15, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v30, v16, vcc +; GFX9-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13] +; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v31, v17, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; GFX9-NEXT: v_or_b32_e32 v12, v18, v12 +; GFX9-NEXT: v_and_b32_e32 v18, v8, v23 +; GFX9-NEXT: v_or_b32_e32 v13, v19, v13 +; GFX9-NEXT: v_and_b32_e32 v19, v8, v21 +; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v14, v18 +; GFX9-NEXT: v_and_b32_e32 v32, v8, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, v15, v19, vcc +; GFX9-NEXT: v_and_b32_e32 v33, v8, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v16, v32, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v17, v33, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, -1, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v26, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v27, vcc +; GFX9-NEXT: 
v_or_b32_e32 v18, v24, v26 +; GFX9-NEXT: v_or_b32_e32 v19, v25, v27 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX9-NEXT: v_mov_b32_e32 v19, v9 +; GFX9-NEXT: v_or3_b32 v7, v7, 0, v11 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v18, v8 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB2_3 +; GFX9-NEXT: ; %bb.4: ; %Flow +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB2_5: ; %Flow2 +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[12:13] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v13 +; GFX9-NEXT: v_or3_b32 v11, v7, 0, v11 +; GFX9-NEXT: v_or3_b32 v12, v6, v12, v10 +; GFX9-NEXT: v_or_b32_e32 v10, v9, v15 +; GFX9-NEXT: v_or_b32_e32 v13, v8, v14 +; GFX9-NEXT: .LBB2_6: ; %Flow3 +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mul_lo_u32 v16, v13, v5 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v23, v13, 0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v21, v13, v[14:15] +; GFX9-NEXT: v_mul_lo_u32 v9, v10, v4 +; GFX9-NEXT: v_mul_lo_u32 v11, v11, v23 +; GFX9-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v23, v10, v[13:14] +; GFX9-NEXT: v_add3_u32 v8, v8, v16, v9 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v23, v[7:8] +; GFX9-NEXT: v_mov_b32_e32 v8, v14 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v8 +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mul_lo_u32 v12, v12, v21 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v21, v10, v[8:9] +; GFX9-NEXT: v_add3_u32 v4, v11, v7, v12 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v7, v13 +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v5 +; 
GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX9-NEXT: v_xor_b32_e32 v5, v0, v20 +; GFX9-NEXT: v_xor_b32_e32 v0, v2, v20 +; GFX9-NEXT: v_xor_b32_e32 v4, v1, v22 +; GFX9-NEXT: v_xor_b32_e32 v1, v3, v22 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v20 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v22, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v20, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v22, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-O0-LABEL: v_srem_i128_vv: +; GFX9-O0: ; %bb.0: ; %_udiv-special-cases +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: 
def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 +; GFX9-O0-NEXT: v_ashrrev_i64 v[12:13], s4, v[6:7] +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-O0-NEXT: v_ashrrev_i64 v[6:7], s4, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-O0-NEXT: v_xor_b32_e64 v13, v11, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed 
$exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_xor_b32_e64 v15, v4, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_xor_b32_e64 v7, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_xor_b32_e64 v2, v2, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v12 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v11, v12, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v10, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed 
$exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v6 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v5, v6, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 
4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 1 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[1:2], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v15, v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v3, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v15 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], s[6:7] +; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-O0-NEXT: s_mov_b32 s9, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v6, v7 +; GFX9-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 +; GFX9-O0-NEXT: s_mov_b32 s12, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 +; GFX9-O0-NEXT: s_mov_b32 s14, s11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[12:13], v8, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[12:13] +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v2 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v5, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; 
GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v4 +; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 +; GFX9-O0-NEXT: s_mov_b32 s8, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 +; GFX9-O0-NEXT: s_mov_b32 s10, s11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v11, s[8:9], v11, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[8:9], v5, v12, s[8:9] +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: s_mov_b32 s10, s6 +; GFX9-O0-NEXT: s_mov_b32 s11, s7 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v6, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v6, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v6, v7, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; 
GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[14:15] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[8:9] +; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v7, 1 +; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: s_mov_b32 s14, s13 +; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s14 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 +; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v7, v7, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[5:6], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v2, v5, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 3 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; 
GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-O0-NEXT: s_branch .LBB2_8 +; GFX9-O0-NEXT: .LBB2_1: ; %Flow +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: ; %bb.2: ; %Flow +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB2_5 +; GFX9-O0-NEXT: .LBB2_3: ; %Flow2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 2 +; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 3 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB2_9 +; GFX9-O0-NEXT: .LBB2_4: ; %udiv-loop-exit +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s4, v[9:10] +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-O0-NEXT: v_or3_b32 v4, v4, v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 +; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB2_3 +; GFX9-O0-NEXT: .LBB2_5: ; %Flow1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 
s4, v8, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB2_4 +; GFX9-O0-NEXT: .LBB2_6: ; %udiv-do-while +; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 +; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, 
s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v30 +; GFX9-O0-NEXT: s_mov_b32 s5, 1 +; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], s5, v[23:24] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: v_or_b32_e64 v23, v5, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[2:3] +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v29 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] +; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[6:7] +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v30 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 +; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: 
v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v25 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], 1 +; GFX9-O0-NEXT: s_mov_b32 s8, s5 +; GFX9-O0-NEXT: v_and_b32_e64 v12, v7, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13 +; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_and_b32_e64 v14, v11, s4 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v23, v7, v23 +; GFX9-O0-NEXT: v_and_b32_e64 v21, v11, v21 +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, 
v20 +; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v23 +; GFX9-O0-NEXT: v_and_b32_e64 v23, v11, v19 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v22 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v20 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v19, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_mov_b32 s5, s8 +; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v20, vcc, v11, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v10, v11, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def 
$vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v21 +; GFX9-O0-NEXT: v_or_b32_e64 v19, v19, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v17, v17, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v19 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[12:13] +; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_writelane_b32 
v16, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 8 +; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) 
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execnz .LBB2_6 +; GFX9-O0-NEXT: s_branch .LBB2_1 +; GFX9-O0-NEXT: .LBB2_7: ; %udiv-preheader +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(9) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: 
v_lshrrev_b64 v[6:7], v4, v[21:22] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: s_mov_b32 s6, 64 +; GFX9-O0-NEXT: v_sub_u32_e64 v12, s6, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], v12, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v24 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v4, s6 +; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s6 +; GFX9-O0-NEXT: v_lshrrev_b64 v[23:24], v5, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[4:5] +; GFX9-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v22 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v23 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b32 s8, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v12, v12, v15, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: s_mov_b32 s8, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 
def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_mov_b32 s5, s8 +; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v12, vcc, v12, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v15, v17, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v15, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v15, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v13 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 8 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 
; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB2_6 +; GFX9-O0-NEXT: .LBB2_8: ; %udiv-bb1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; 
GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 +; GFX9-O0-NEXT: s_mov_b32 s5, s6 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O0-NEXT: s_mov_b32 s4, s7 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b32 s8, s6 +; GFX9-O0-NEXT: s_mov_b32 s9, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v9, vcc, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: buffer_store_dword v5, 
off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f +; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-O0-NEXT: s_mov_b32 s4, 64 +; GFX9-O0-NEXT: v_sub_u32_e64 v14, s4, v3 +; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v14, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v16 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v3, s4 +; GFX9-O0-NEXT: s_mov_b32 s10, 63 +; GFX9-O0-NEXT: v_sub_u32_e64 v4, s10, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v4, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[4:5] +; GFX9-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v3, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[10:11] +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def 
$vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[7:8], v3, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v4, v7, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[1:2], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execz .LBB2_5 +; GFX9-O0-NEXT: s_branch .LBB2_7 +; GFX9-O0-NEXT: .LBB2_9: ; %udiv-end +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, 
off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[5:6] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v17 +; GFX9-O0-NEXT: v_mul_lo_u32 v3, v1, v0 +; GFX9-O0-NEXT: v_lshrrev_b64 v[17:18], s4, v[17:18] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v17 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mul_lo_u32 v2, v5, v2 +; GFX9-O0-NEXT: v_mad_u64_u32 v[17:18], s[6:7], v5, v0, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v18 +; GFX9-O0-NEXT: v_add3_u32 v2, v0, v2, v3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 killed $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: s_mov_b32 s5, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v0 
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v18 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v17 +; GFX9-O0-NEXT: v_or_b32_e64 v17, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v0 +; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v11 +; GFX9-O0-NEXT: v_mul_lo_u32 v3, v2, v6 +; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s4, v[11:12] +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v19 +; GFX9-O0-NEXT: v_mul_lo_u32 v11, v11, v0 +; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v2, v0, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 killed $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v19 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v18 +; 
GFX9-O0-NEXT: v_add_co_u32_e64 v17, s[6:7], v11, v12 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v2 +; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v6, v1, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v19 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v20 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v12 +; GFX9-O0-NEXT: v_lshlrev_b64 v[19:20], s4, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v19 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v6, v5, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v19 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v20 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v21 +; GFX9-O0-NEXT: 
v_lshlrev_b64 v[19:20], s4, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v21 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v19 +; GFX9-O0-NEXT: v_or_b32_e64 v23, v11, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v6 +; GFX9-O0-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v0, v5, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v22 +; GFX9-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v5, v20 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v19, s[6:7], v6, v19, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v6 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0xffffffff +; GFX9-O0-NEXT: s_mov_b32 s8, s7 +; GFX9-O0-NEXT: v_and_b32_e64 v19, v19, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 +; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 +; GFX9-O0-NEXT: v_and_b32_e64 v21, v20, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 +; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v0, v1, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v19 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v20 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: 
$sgpr7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v1 +; GFX9-O0-NEXT: v_lshlrev_b64 v[19:20], s4, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v23 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 killed $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v23, v1, v19 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v22 +; GFX9-O0-NEXT: v_add_co_u32_e64 v0, s[6:7], v0, v20 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v19, s[6:7], v1, v19, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v19 +; GFX9-O0-NEXT: v_lshrrev_b64 v[21:22], s4, v[0:1] +; GFX9-O0-NEXT: v_lshrrev_b64 v[5:6], s4, v[5:6] +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v22 +; GFX9-O0-NEXT: v_add_co_u32_e64 v19, s[6:7], v19, v20 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v6, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v20 +; GFX9-O0-NEXT: v_add_co_u32_e64 v19, s[6:7], v5, v6 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v17 
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v18 +; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[6:7], v2, v6 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v3, v5, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v14 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v12 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v11, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v1, v2, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 
v2, v10 +; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_xor_b32_e64 v9, v6, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: v_xor_b32_e64 v0, v0, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v7, vcc, v7, v8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v5, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: v_lshrrev_b64 v[5:6], s4, v[5:6] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; 
GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_setpc_b64 s[30:31] + %div = srem i128 %lhs, %rhs + ret i128 %div +} + +define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { +; GFX9-LABEL: v_urem_i128_vv: +; GFX9: ; %bb.0: ; %_udiv-special-cases +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v9, v5, v7 +; GFX9-NEXT: v_or_b32_e32 v8, v4, v6 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX9-NEXT: v_or_b32_e32 v9, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v8, v0, v2 +; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9] +; GFX9-NEXT: v_ffbh_u32_e32 v8, v6 +; GFX9-NEXT: v_add_u32_e32 v8, 32, v8 +; GFX9-NEXT: v_ffbh_u32_e32 v9, v7 +; GFX9-NEXT: v_min_u32_e32 v8, v8, v9 +; GFX9-NEXT: v_ffbh_u32_e32 v9, v4 +; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX9-NEXT: v_ffbh_u32_e32 v10, v5 +; GFX9-NEXT: v_min_u32_e32 v9, v9, v10 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 64, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v10, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_ffbh_u32_e32 v11, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; GFX9-NEXT: v_ffbh_u32_e32 v9, v2 +; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX9-NEXT: v_min_u32_e32 v9, v9, v11 +; GFX9-NEXT: v_ffbh_u32_e32 v11, v0 +; GFX9-NEXT: v_add_u32_e32 v11, 32, v11 +; GFX9-NEXT: v_ffbh_u32_e32 v12, v1 +; GFX9-NEXT: v_min_u32_e32 v11, v11, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, 64, v11 +; GFX9-NEXT: v_addc_co_u32_e64 v12, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f +; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, 0, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v8, v9 
+; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v10, v12, vcc +; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v11, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GFX9-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; GFX9-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GFX9-NEXT: v_xor_b32_e32 v12, 0x7f, v8 +; GFX9-NEXT: v_or_b32_e32 v13, v9, v11 +; GFX9-NEXT: v_or_b32_e32 v12, v12, v10 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] +; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GFX9-NEXT: v_cndmask_b32_e64 v15, v3, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v13, v1, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v12, v0, 0, s[4:5] +; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB3_6 +; GFX9-NEXT: ; %bb.1: ; %udiv-bb1 +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 1, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v9, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v10, vcc +; GFX9-NEXT: v_sub_u32_e32 v15, 0x7f, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v11, vcc +; GFX9-NEXT: v_sub_u32_e32 v13, 64, v15 +; GFX9-NEXT: v_or_b32_e32 v10, v23, v25 +; GFX9-NEXT: v_or_b32_e32 v9, v22, v24 +; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], v13, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v8, 63, v8 +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10] +; GFX9-NEXT: v_lshlrev_b64 v[8:9], v8, v[0:1] +; GFX9-NEXT: v_or_b32_e32 v10, v12, v14 +; GFX9-NEXT: v_or_b32_e32 v11, v11, v13 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5] +; 
GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v15, v[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v15 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v11, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5] +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB3_5 +; GFX9-NEXT: ; %bb.2: ; %udiv-preheader +; GFX9-NEXT: v_sub_u32_e32 v14, 64, v22 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], v22, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[14:15], v14, v[2:3] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 +; GFX9-NEXT: v_or_b32_e32 v14, v12, v14 +; GFX9-NEXT: v_subrev_u32_e32 v12, 64, v22 +; GFX9-NEXT: v_or_b32_e32 v15, v13, v15 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], v12, v[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v17, v13, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v14, v12, v14, vcc +; GFX9-NEXT: v_lshrrev_b64 v[12:13], v22, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v16, v14, v0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v19, 0, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v12, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, -1, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, -1, v6, vcc +; GFX9-NEXT: v_mov_b32_e32 v20, 0 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, -1, v7, vcc +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v21, 0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: .LBB3_3: ; %udiv-do-while +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v11 +; GFX9-NEXT: 
v_lshlrev_b64 v[10:11], 1, v[10:11] +; GFX9-NEXT: v_lshlrev_b64 v[18:19], 1, v[18:19] +; GFX9-NEXT: v_or_b32_e32 v10, v20, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 31, v17 +; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[16:17] +; GFX9-NEXT: v_or_b32_e32 v18, v18, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 31, v9 +; GFX9-NEXT: v_or_b32_e32 v16, v16, v20 +; GFX9-NEXT: v_sub_co_u32_e32 v20, vcc, v26, v16 +; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v27, v17, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v28, v18, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v29, v19, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v30, 31, v20 +; GFX9-NEXT: v_and_b32_e32 v20, v30, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v16, v20 +; GFX9-NEXT: v_and_b32_e32 v20, v30, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v17, v20, vcc +; GFX9-NEXT: v_and_b32_e32 v20, v30, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v18, v20, vcc +; GFX9-NEXT: v_and_b32_e32 v20, v30, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v19, vcc, v19, v20, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, -1, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v24, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc +; GFX9-NEXT: v_or_b32_e32 v11, v21, v11 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-NEXT: v_or_b32_e32 v20, v22, v24 +; GFX9-NEXT: v_or_b32_e32 v21, v23, v25 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] +; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 +; GFX9-NEXT: v_and_b32_e32 v12, 1, v30 +; GFX9-NEXT: v_mov_b32_e32 v21, v13 +; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v20, v12 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: ; %bb.4: ; %Flow +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB3_5: ; %Flow2 +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[10:11] +; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; 
GFX9-NEXT: v_lshrrev_b32_e32 v10, 31, v11 +; GFX9-NEXT: v_or3_b32 v15, v9, 0, v15 +; GFX9-NEXT: v_or3_b32 v14, v8, v10, v14 +; GFX9-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX9-NEXT: v_or_b32_e32 v12, v12, v16 +; GFX9-NEXT: .LBB3_6: ; %Flow3 +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mul_lo_u32 v19, v12, v7 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v12, 0 +; GFX9-NEXT: v_mov_b32_e32 v17, 0 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v16, v8 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v12, v[16:17] +; GFX9-NEXT: v_mul_lo_u32 v18, v13, v6 +; GFX9-NEXT: v_mul_lo_u32 v16, v15, v4 +; GFX9-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-NEXT: v_mov_b32_e32 v12, v17 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v13, v[11:12] +; GFX9-NEXT: v_add3_u32 v10, v10, v19, v18 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v4, v[9:10] +; GFX9-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-NEXT: v_mul_lo_u32 v10, v14, v5 +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, v6, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v13, v[14:15] +; GFX9-NEXT: v_add3_u32 v6, v16, v9, v10 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v6, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-O0-LABEL: v_urem_i128_vv: +; GFX9-O0: ; %bb.0: ; %_udiv-special-cases +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded 
Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; 
GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], 
s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 1 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[1:2], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v15, v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v3, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v15 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], s[6:7] +; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-O0-NEXT: s_mov_b32 s9, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v6, v7 +; GFX9-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 +; GFX9-O0-NEXT: s_mov_b32 s12, s10 +; GFX9-O0-NEXT: 
v_mov_b32_e32 v5, v16 +; GFX9-O0-NEXT: s_mov_b32 s14, s11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[12:13], v8, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[12:13] +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v2 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v5, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v4 +; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 +; GFX9-O0-NEXT: s_mov_b32 s8, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 +; GFX9-O0-NEXT: s_mov_b32 s10, s11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v11, s[8:9], v11, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[8:9], v5, v12, s[8:9] +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def 
$vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: s_mov_b32 s10, s6 +; GFX9-O0-NEXT: s_mov_b32 s11, s7 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v6, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v6, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v6, v7, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[14:15] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[8:9] +; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v7, 1 +; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: s_mov_b32 s14, s13 +; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s14 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 +; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v7, v7, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[5:6], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v2, v5, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-O0-NEXT: 
v_cndmask_b32_e64 v1, v1, v2, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 3 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execz .LBB3_3 +; GFX9-O0-NEXT: s_branch .LBB3_8 +; GFX9-O0-NEXT: .LBB3_1: ; %Flow +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: ; %bb.2: ; %Flow +; GFX9-O0-NEXT: buffer_load_dword v6, 
off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB3_5 +; GFX9-O0-NEXT: .LBB3_3: ; %Flow2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 2 +; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 3 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB3_9 +; GFX9-O0-NEXT: .LBB3_4: ; %udiv-loop-exit +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s4, v[9:10] +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; 
GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-O0-NEXT: v_or3_b32 v4, v4, v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 +; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB3_3 +; GFX9-O0-NEXT: .LBB3_5: ; %Flow1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], 
s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB3_4 +; GFX9-O0-NEXT: .LBB3_6: ; %udiv-do-while +; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 +; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; 
GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v30 +; GFX9-O0-NEXT: s_mov_b32 s5, 1 +; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], s5, v[23:24] +; GFX9-O0-NEXT: v_mov_b32_e32 
v4, v24 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: v_or_b32_e64 v23, v5, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[2:3] +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v29 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] +; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[6:7] +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v30 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 +; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v25 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], 1 +; GFX9-O0-NEXT: s_mov_b32 s8, s5 +; GFX9-O0-NEXT: v_and_b32_e64 v12, v7, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13 +; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_and_b32_e64 v14, v11, s4 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v23, v7, v23 +; GFX9-O0-NEXT: v_and_b32_e64 v21, v11, v21 +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v20 +; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v23 +; GFX9-O0-NEXT: v_and_b32_e64 v23, v11, v19 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v22 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v20 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v19, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, 
vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_mov_b32 s5, s8 +; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v20, vcc, v11, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v10, v11, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v21 +; GFX9-O0-NEXT: v_or_b32_e64 v19, v19, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v17, v17, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; 
GFX9-O0-NEXT: v_mov_b32_e32 v18, v19 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[12:13] +; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 8 +; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execnz .LBB3_6 +; GFX9-O0-NEXT: s_branch .LBB3_1 +; GFX9-O0-NEXT: .LBB3_7: ; %udiv-preheader +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(9) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[21:22] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: s_mov_b32 s6, 64 +; GFX9-O0-NEXT: v_sub_u32_e64 v12, s6, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], v12, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v24 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: 
v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v4, s6 +; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s6 +; GFX9-O0-NEXT: v_lshrrev_b64 v[23:24], v5, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[4:5] +; GFX9-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v22 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v23 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b32 s8, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v12, v12, v15, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: s_mov_b32 s8, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_mov_b32 s5, s8 +; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v12, vcc, v12, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v15, v17, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-O0-NEXT: 
v_addc_co_u32_e32 v14, vcc, v14, v15, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v15, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v13 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 8 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte 
Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB3_6 +; GFX9-O0-NEXT: .LBB3_8: ; %udiv-bb1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; 
GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 +; GFX9-O0-NEXT: s_mov_b32 s5, s6 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O0-NEXT: s_mov_b32 s4, s7 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b32 s8, s6 +; GFX9-O0-NEXT: s_mov_b32 s9, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v9, vcc, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f +; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 
v[5:6], v3, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-O0-NEXT: s_mov_b32 s4, 64 +; GFX9-O0-NEXT: v_sub_u32_e64 v14, s4, v3 +; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v14, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v16 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v3, s4 +; GFX9-O0-NEXT: s_mov_b32 s10, 63 +; GFX9-O0-NEXT: v_sub_u32_e64 v4, s10, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v4, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[4:5] +; GFX9-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v3, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[10:11] +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[7:8], v3, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v4, v7, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; 
GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[1:2], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-O0-NEXT: 
s_and_b64 s[4:5], s[6:7], s[4:5] +; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execz .LBB3_5 +; GFX9-O0-NEXT: s_branch .LBB3_7 +; GFX9-O0-NEXT: .LBB3_9: ; %udiv-end +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v13 +; GFX9-O0-NEXT: 
v_mul_lo_u32 v5, v6, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], s4, v[13:14] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mul_lo_u32 v3, v7, v3 +; GFX9-O0-NEXT: v_mad_u64_u32 v[13:14], s[6:7], v7, v2, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[17:18], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 killed $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: s_mov_b32 s5, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v3, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[15:16] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v11 +; GFX9-O0-NEXT: v_mul_lo_u32 v3, v2, v8 +; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s4, v[11:12] +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v15 +; GFX9-O0-NEXT: v_mul_lo_u32 v11, v11, v5 +; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v2, v5, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; 
implicit-def: $sgpr7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 killed $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v14 +; GFX9-O0-NEXT: v_add_co_u32_e64 v13, s[6:7], v11, v12 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 +; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v8, v6, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12 +; GFX9-O0-NEXT: 
v_lshlrev_b64 v[15:16], s4, v[15:16] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v8, v7, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-O0-NEXT: v_lshlrev_b64 v[15:16], s4, v[15:16] +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v8, v8, v17 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v19, v11, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v8 +; GFX9-O0-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v5, v7, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18 +; GFX9-O0-NEXT: v_add_co_u32_e64 v7, s[6:7], 
v7, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v15, s[6:7], v8, v15, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v8 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0xffffffff +; GFX9-O0-NEXT: s_mov_b32 s8, s7 +; GFX9-O0-NEXT: v_and_b32_e64 v15, v15, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v7 +; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 +; GFX9-O0-NEXT: v_and_b32_e64 v17, v16, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 +; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v5, v6, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v15 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v6 +; GFX9-O0-NEXT: v_lshlrev_b64 v[15:16], s4, v[15:16] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v19 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 killed $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v19, v6, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18 +; GFX9-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v5, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v15, s[6:7], v6, v15, s[6:7] +; 
GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v15 +; GFX9-O0-NEXT: v_lshrrev_b64 v[17:18], s4, v[5:6] +; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 +; GFX9-O0-NEXT: v_add_co_u32_e64 v15, s[6:7], v15, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v7, v8, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16 +; GFX9-O0-NEXT: v_add_co_u32_e64 v15, s[6:7], v7, v8 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[6:7], v2, v8 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v3, v7, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7 +; GFX9-O0-NEXT: v_lshlrev_b64 v[6:7], s4, v[5:6] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: ; kill: def 
$vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v7, vcc, v7, v8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v5, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: v_lshrrev_b64 v[5:6], s4, v[5:6] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_setpc_b64 s[30:31] + %div = urem i128 %lhs, %rhs + ret i128 %div +} + +define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) { +; GFX9-LABEL: v_sdiv_i128_v_pow2k: +; GFX9: ; %bb.0: 
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], 31, v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_ashrrev_i64 v[2:3], 33, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-O0-LABEL: v_sdiv_i128_v_pow2k: +; GFX9-O0: ; %bb.0: +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: v_ashrrev_i64 v[4:5], s4, v[4:5] +; GFX9-O0-NEXT: s_mov_b32 s5, 31 +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s5, v[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-O0-NEXT: s_mov_b32 s6, s8 +; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: 
v_add_co_u32_e32 v0, vcc, v0, v5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v5, vcc, v2, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: s_mov_b32 s4, 33 +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_lshl_or_b32 v0, v2, s5, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-O0-NEXT: v_ashrrev_i64 v[3:4], s4, v[3:4] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-O0-NEXT: v_alignbit_b32 v1, v1, v2, s4 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: s_setpc_b64 s[30:31] + %div = sdiv i128 %lhs, 8589934592 + ret i128 %div +} + +define i128 @v_srem_i128_v_pow2k(i128 %lhs) { +; GFX9-LABEL: v_srem_i128_v_pow2k: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], 31, v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v2, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v3, vcc +; GFX9-NEXT: v_and_b32_e32 v4, -2, v4 +; 
GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, 0, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-O0-LABEL: v_srem_i128_v_pow2k: +; GFX9-O0: ; %bb.0: +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: v_ashrrev_i64 v[6:7], s4, v[6:7] +; GFX9-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s4, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b32 s5, s6 +; GFX9-O0-NEXT: s_mov_b32 s4, s7 +; GFX9-O0-NEXT: v_add_co_u32_e32 v6, vcc, v5, v4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v2, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v3, v2, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v1, v2, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-O0-NEXT: s_mov_b32 s6, -2 +; 
GFX9-O0-NEXT: s_mov_b32 s4, 0 +; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX9-O0-NEXT: s_mov_b32 s5, s6 +; GFX9-O0-NEXT: s_mov_b32 s6, s5 +; GFX9-O0-NEXT: v_and_b32_e64 v4, v4, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_and_b32_e64 v9, v6, s4 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v7 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: v_lshrrev_b64 v[5:6], s4, v[5:6] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: s_setpc_b64 s[30:31] + %div = srem i128 %lhs, 8589934592 + ret i128 %div +} + +define i128 @v_udiv_i128_v_pow2k(i128 %lhs) { +; GFX9-LABEL: v_udiv_i128_v_pow2k: +; 
GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v4 +; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-O0-LABEL: v_udiv_i128_v_pow2k: +; GFX9-O0: ; %bb.0: +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-O0-NEXT: s_mov_b32 s4, 33 +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: s_mov_b32 s5, 31 +; GFX9-O0-NEXT: v_lshl_or_b32 v0, v4, s5, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[1:2] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-O0-NEXT: v_alignbit_b32 v1, v1, v4, s4 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-O0-NEXT: s_setpc_b64 s[30:31] + %div = udiv i128 %lhs, 8589934592 + ret i128 %div +} + +define i128 @v_urem_i128_v_pow2k(i128 %lhs) { +; GFX9-LABEL: v_urem_i128_v_pow2k: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-O0-LABEL: v_urem_i128_v_pow2k: +; GFX9-O0: ; %bb.0: +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr1 killed $exec +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-O0-NEXT: s_mov_b32 s4, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX9-O0-NEXT: s_mov_b32 s5, s6 +; GFX9-O0-NEXT: s_mov_b32 s6, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: v_and_b32_e64 v3, v2, s6 +; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_and_b32_e64 v1, v0, s4 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: v_lshrrev_b64 v[1:2], s4, v[1:2] +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: s_setpc_b64 s[30:31] + %div = urem i128 %lhs, 8589934592 + ret i128 %div } + +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX9-SDAG: {{.*}} +; GFX9-SDAG-O0: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll new file mode 100644 index 00000000000000..46e2632e45a190 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -0,0 +1,25 @@ +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s + +; SDAG-ERR: LLVM ERROR: unsupported libcall legalization +; GISEL-ERR: LLVM ERROR: unable to legalize instruction: %{{[0-9]+}}:_(s128) = G_SDIV %{{[0-9]+}}:_, %{{[0-9]+}}:_ (in function: v_sdiv_v2i128_vv) + +define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { + %shl = sdiv <2 x i128> %lhs, %rhs + ret <2 x i128> %shl +} + +define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { + %shl = udiv <2 x i128> %lhs, %rhs + ret <2 x i128> %shl +} + +define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { + %shl = srem <2 x i128> %lhs, %rhs + ret <2 x i128> %shl +} + +define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { + %shl = urem <2 x i128> %lhs, %rhs + ret <2 x i128> %shl +} diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll index 380a13ed16128f..47110d94918879 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll @@ -55,7 +55,6 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() { ; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0 ; GFX12-NEXT: v_mov_b32_e32 v31, v0 -; GFX12-NEXT: s_mov_b32 s12, ttmp9 ; GFX12-NEXT: s_mov_b64 s[8:9], 0 ; GFX12-NEXT: s_mov_b32 s32, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 diff 
--git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll new file mode 100644 index 00000000000000..f49fec60892cda --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll @@ -0,0 +1,142 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s + +define amdgpu_cs float @test_cvt_f32_bf8_byte0(i32 %a) { +; GFX12-LABEL: test_cvt_f32_bf8_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_f32_bf8_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 0) + ret float %ret +} + +define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) { +; GFX12-LABEL: test_cvt_f32_bf8_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0] +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 1) + ret float %ret +} + +define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) { +; GFX12-LABEL: test_cvt_f32_bf8_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1] +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 2) + ret float %ret +} + +define amdgpu_cs float @test_cvt_f32_fp8_byte3(i32 
%a) { +; GFX12-LABEL: test_cvt_f32_fp8_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,1] +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %tmp0, i32 3) + ret float %ret +} + +define amdgpu_cs void @test_cvt_pk_bf8_f32_word0(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_pk_bf8_f32_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_pk_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %tmp1, float %y, i32 %old, i1 false) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_cvt_pk_fp8_f32_word1(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_pk_fp8_f32_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %tmp1, float %y, i32 %old, i1 true) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void 
@test_cvt_sr_bf8_f32_byte0(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_sr_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %tmp1, i32 %r, i32 %old, i32 0) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %tmp1, i32 %r, i32 %old, i32 1) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_cvt_sr_fp8_f32_byte2(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 
@llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %tmp1, i32 %r, i32 %old, i32 2) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32) +declare float @llvm.amdgcn.cvt.f32.fp8(i32, i32) +declare i32 @llvm.amdgcn.cvt.pk.bf8.f32(float, float, i32, i1) +declare i32 @llvm.amdgcn.cvt.pk.fp8.f32(float, float, i32, i1) +declare i32 @llvm.amdgcn.cvt.sr.bf8.f32(float, i32, i32, i32) +declare i32 @llvm.amdgcn.cvt.sr.fp8.f32(float, i32, i32, i32) + +declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1 + +attributes #0 = { nounwind convergent } +attributes #1 = { nounwind readnone convergent } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir new file mode 100644 index 00000000000000..d11fb27640ee75 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir @@ -0,0 +1,197 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass=gcn-dpp-combine %s -o - | FileCheck -check-prefix=GFX12 %s + +--- +name: test_cvt_f32_bf8_byte0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX12-LABEL: name: test_cvt_f32_bf8_byte0 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_F32_BF8_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_dpp [[DEF]], [[COPY]], 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_dpp]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %2:vgpr_32 = V_CVT_F32_BF8_e32 killed %1, implicit $mode, implicit 
$exec + $vgpr0 = COPY %2 + SI_RETURN_TO_EPILOG $vgpr0 + +... +--- +name: test_cvt_f32_bf8_byte2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX12-LABEL: name: test_cvt_f32_bf8_byte2 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY]], [[COPY]], 228, 15, 15, -1, implicit $exec + ; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed [[V_MOV_B32_dpp]], 0, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_e64_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed %1, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %2 + SI_RETURN_TO_EPILOG $vgpr0 + +... +--- +name: test_cvt_f32_fp8_byte3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX12-LABEL: name: test_cvt_f32_fp8_byte3 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY]], [[COPY]], 228, 15, 15, -1, implicit $exec + ; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed [[V_MOV_B32_dpp]], 0, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_FP8_OP_SEL_e64_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %2:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed %1, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %2 + SI_RETURN_TO_EPILOG $vgpr0 + +... 
+--- +name: test_cvt_pk_bf8_f32_word0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_pk_bf8_f32_word0 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_PK_BF8_F32_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_PK_BF8_F32_e64_dpp [[DEF]], 0, [[COPY4]], 0, [[COPY3]], [[COPY2]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_PK_BF8_F32_e64_dpp]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_PK_BF8_F32_e64 0, killed %6, 0, %1, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... 
+--- +name: test_cvt_pk_fp8_f32_word1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_pk_fp8_f32_word1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY4]], [[COPY4]], 228, 15, 15, -1, implicit $exec + ; GFX12-NEXT: [[V_CVT_PK_FP8_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_PK_FP8_F32_e64 8, killed [[V_MOV_B32_dpp]], 0, [[COPY3]], [[COPY2]], 0, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_PK_FP8_F32_e64_]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_PK_FP8_F32_e64 8, killed %6, 0, %1, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... 
+--- +name: test_cvt_sr_bf8_f32_byte0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_sr_bf8_f32_byte0 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_SR_BF8_F32_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_SR_BF8_F32_e64_dpp [[DEF]], 0, [[COPY4]], 0, [[COPY3]], 0, [[COPY2]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_SR_BF8_F32_e64_dpp]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_SR_BF8_F32_e64 0, killed %6, 0, %1, 0, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... 
+--- +name: test_cvt_sr_fp8_f32_byte2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_sr_fp8_f32_byte2 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY4]], [[COPY4]], 228, 15, 15, -1, implicit $exec + ; GFX12-NEXT: [[V_CVT_SR_FP8_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_SR_FP8_F32_e64 8, killed [[V_MOV_B32_dpp]], 0, [[COPY3]], 0, [[COPY2]], 0, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_SR_FP8_F32_e64_]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_SR_FP8_F32_e64 8, killed %6, 0, %1, 0, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll index 26d0d702d99db4..17b1fcf865e94e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll @@ -1,4 +1,6 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32) declare float @llvm.amdgcn.cvt.f32.fp8(i32, i32) @@ -9,182 +11,524 @@ declare i32 @llvm.amdgcn.cvt.pk.fp8.f32(float, float, i32, i1) declare i32 @llvm.amdgcn.cvt.sr.bf8.f32(float, i32, i32, i32) declare i32 @llvm.amdgcn.cvt.sr.fp8.f32(float, i32, i32, i32) -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte0: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_0{{$}} define float @test_cvt_f32_bf8_byte0(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_bf8_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte1: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_1 define float @test_cvt_f32_bf8_byte1(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte2: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_2 define float @test_cvt_f32_bf8_byte2(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 2) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte3: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_3 define float @test_cvt_f32_bf8_byte3(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 3) ret float %ret } 
-; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte0: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_0{{$}} define float @test_cvt_f32_fp8_byte0(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_fp8_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 0) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte1: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_1 define float @test_cvt_f32_fp8_byte1(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte2: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_2 define float @test_cvt_f32_fp8_byte2(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; 
GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[0,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 2) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte3: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_3 define float @test_cvt_f32_fp8_byte3(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 3) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_bf8_word0: -; GCN: v_cvt_pk_f32_bf8_e32 v[0:1], v0{{$}} define <2 x float> @test_cvt_pk_f32_bf8_word0(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_bf8_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_bf8_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_bf8_word1: -; GCN: v_cvt_pk_f32_bf8_sdwa v[0:1], v0 src0_sel:WORD_1 define <2 x float> @test_cvt_pk_f32_bf8_word1(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_bf8_word1: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_bf8_sdwa v[0:1], v0 src0_sel:WORD_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_bf8_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 true) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_fp8_word0: -; GCN: v_cvt_pk_f32_fp8_e32 v[0:1], v0{{$}} define <2 x float> @test_cvt_pk_f32_fp8_word0(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_fp8_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_fp8_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 false) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_fp8_word1: -; GCN: v_cvt_pk_f32_fp8_sdwa v[0:1], v0 src0_sel:WORD_1 define <2 x float> @test_cvt_pk_f32_fp8_word1(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_fp8_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_fp8_sdwa v[0:1], v0 src0_sel:WORD_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_fp8_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 
0x0 +; GFX12-NEXT: v_cvt_pk_f32_fp8_e64 v[0:1], v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_bf8_f32_word0: -; GCN: v_cvt_pk_bf8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_bf8_f32_word0(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_bf8_f32_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_bf8_f32_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 false) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_pk_bf8_f32_word1: -; GCN: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_bf8_f32_word1(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_bf8_f32_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_bf8_f32_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: 
s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 true) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_pk_fp8_f32_word0: -; GCN: v_cvt_pk_fp8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_fp8_f32_word0(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_fp8_f32_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_fp8_f32_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 false) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_pk_fp8_f32_word1: -; GCN: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_fp8_f32_word1(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_fp8_f32_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_fp8_f32_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float 
%x, float %y, i32 %old, i1 true) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte0: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte0(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 0) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte1: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte1(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 1) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte2: -; GCN: 
v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte2(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 2) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte3: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte3(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 3) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte0: -; GCN: 
v_cvt_sr_fp8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte0(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 0) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte1: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 1) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte2: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 
@test_cvt_sr_fp8_f32_byte2(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 2) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte3: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte3(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 3) ret i32 %ret } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll new file mode 100644 index 
00000000000000..df5533b6295023 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: not --crash llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GFX9-SDAG-ERR %s +; RUN: not --crash llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GFX9-GISEL-ERR %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s + +; GFX9-SDAG-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.wave.id +; GFX9-GISEL-ERR: LLVM ERROR: unable to legalize instruction: {{.*}} = G_INTRINSIC intrinsic(@llvm.amdgcn.wave.id) + +define amdgpu_cs void @test_wave_id(ptr addrspace(1) %out) { +; GFX9-LABEL: test_wave_id: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_bfe_u32 s0, ttmp8, 0x50019 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: test_wave_id: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_bfe_u32 s0, ttmp8, 0x50019 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %waveid = call i32 @llvm.amdgcn.wave.id() + store i32 %waveid, ptr addrspace(1) %out + ret void +} + +define amdgpu_gfx 
void @test_wave_id_callable(ptr addrspace(1) %out) { +; GFX9-LABEL: test_wave_id_callable: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_bfe_u32 s34, ttmp8, 0x50019 +; GFX9-NEXT: v_mov_b32_e32 v2, s34 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_wave_id_callable: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_bfe_u32 s0, ttmp8, 0x50019 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %waveid = call i32 @llvm.amdgcn.wave.id() + store i32 %waveid, ptr addrspace(1) %out + ret void +} + +declare i32 @llvm.amdgcn.wave.id() diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll new file mode 100644 index 00000000000000..afa914c8375f64 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll @@ -0,0 +1,295 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s +; 
RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s + +define amdgpu_kernel void @workgroup_ids_kernel() { +; GFX9-LABEL: workgroup_ids_kernel: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GFX9ARCH-SDAG-LABEL: workgroup_ids_kernel: +; GFX9ARCH-SDAG: ; %bb.0: ; %.entry +; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX9ARCH-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX9ARCH-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9ARCH-SDAG-NEXT: s_endpgm +; +; GFX9ARCH-GISEL-LABEL: workgroup_ids_kernel: +; GFX9ARCH-GISEL: ; %bb.0: ; %.entry +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9ARCH-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9ARCH-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9ARCH-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: workgroup_ids_kernel: +; GFX12-SDAG: ; %bb.0: ; %.entry +; GFX12-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX12-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: workgroup_ids_kernel: +; 
GFX12-GISEL: ; %bb.0: ; %.entry +; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm +.entry: + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + %idy = call i32 @llvm.amdgcn.workgroup.id.y() + %idz = call i32 @llvm.amdgcn.workgroup.id.z() + %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0 + %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1 + %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2 + call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_kernel void @caller() { +; GFX9-SDAG-LABEL: caller: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-SDAG-NEXT: s_mov_b32 s38, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-SDAG-NEXT: s_add_u32 s36, s36, s7 +; GFX9-SDAG-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-SDAG-NEXT: s_add_u32 s8, s2, 36 +; GFX9-SDAG-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-SDAG-NEXT: s_getpc_b64 s[2:3] +; GFX9-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4 +; GFX9-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x0 +; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-SDAG-NEXT: s_mov_b32 s12, s6 +; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; 
GFX9-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_swappc_b64 s[30:31], s[14:15] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: caller: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-GISEL-NEXT: s_mov_b32 s38, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-GISEL-NEXT: s_add_u32 s36, s36, s7 +; GFX9-GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-GISEL-NEXT: s_add_u32 s8, s2, 36 +; GFX9-GISEL-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-GISEL-NEXT: s_getpc_b64 s[0:1] +; GFX9-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x0 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-GISEL-NEXT: s_mov_b32 s12, s6 +; GFX9-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[14:15] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX9ARCH-SDAG-LABEL: caller: +; GFX9ARCH-SDAG: ; %bb.0: +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s38, -1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9ARCH-SDAG-NEXT: s_add_u32 s36, s36, s6 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s37, s37, 0 +; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s2, 36 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s3, 0 +; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[2:3] +; GFX9ARCH-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12 +; GFX9ARCH-SDAG-NEXT: 
s_load_dwordx2 s[6:7], s[2:3], 0x0 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9ARCH-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX9ARCH-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9ARCH-SDAG-NEXT: s_endpgm +; +; GFX9ARCH-GISEL-LABEL: caller: +; GFX9ARCH-GISEL: ; %bb.0: +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s38, -1 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9ARCH-GISEL-NEXT: s_add_u32 s36, s36, s6 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s2, 36 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s3, 0 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[0:1] +; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9ARCH-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX9ARCH-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9ARCH-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: caller: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 
+; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX12-SDAG-NEXT: s_mov_b32 s7, callee@abs32@hi +; GFX12-SDAG-NEXT: s_mov_b32 s6, callee@abs32@lo +; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: caller: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 +; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX12-GISEL-NEXT: s_mov_b32 s6, callee@abs32@lo +; GFX12-GISEL-NEXT: s_mov_b32 s7, callee@abs32@hi +; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX12-GISEL-NEXT: s_endpgm + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + call void @callee(i32 %idx) #0 + ret void +} + +declare void @callee(i32) #0 + +define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) { +; GFX9-LABEL: workgroup_ids_device_func: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s14 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9ARCH-SDAG-LABEL: workgroup_ids_device_func: +; GFX9ARCH-SDAG: ; %bb.0: +; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v6, ttmp9 +; GFX9ARCH-SDAG-NEXT: s_and_b32 s4, ttmp7, 0xffff +; GFX9ARCH-SDAG-NEXT: global_store_dword v[0:1], v6, off +; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9ARCH-SDAG-NEXT: 
s_lshr_b32 s4, ttmp7, 16 +; GFX9ARCH-SDAG-NEXT: global_store_dword v[2:3], v0, off +; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9ARCH-SDAG-NEXT: global_store_dword v[4:5], v0, off +; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9ARCH-GISEL-LABEL: workgroup_ids_device_func: +; GFX9ARCH-GISEL: ; %bb.0: +; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v6, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_and_b32 s4, ttmp7, 0xffff +; GFX9ARCH-GISEL-NEXT: s_lshr_b32 s5, ttmp7, 16 +; GFX9ARCH-GISEL-NEXT: global_store_dword v[0:1], v6, off +; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9ARCH-GISEL-NEXT: global_store_dword v[2:3], v0, off +; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9ARCH-GISEL-NEXT: global_store_dword v[4:5], v0, off +; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: workgroup_ids_device_func: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v6, ttmp9 :: v_dual_mov_b32 v7, s0 +; GFX12-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-NEXT: v_mov_b32_e32 v8, s1 +; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_store_b32 v[2:3], v7, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_store_b32 v[4:5], v8, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %id.x = call i32 @llvm.amdgcn.workgroup.id.x() + %id.y = call i32 @llvm.amdgcn.workgroup.id.y() + 
%id.z = call i32 @llvm.amdgcn.workgroup.id.z() + store volatile i32 %id.x, ptr addrspace(1) %outx + store volatile i32 %id.y, ptr addrspace(1) %outy + store volatile i32 %id.z, ptr addrspace(1) %outz + ret void +} + +declare i32 @llvm.amdgcn.workgroup.id.x() +declare i32 @llvm.amdgcn.workgroup.id.y() +declare i32 @llvm.amdgcn.workgroup.id.z() +declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg) + +attributes #0 = { nounwind "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX9ARCH: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll new file mode 100644 index 00000000000000..cfff0a969da9e7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll @@ -0,0 +1,187 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s + 
+define amdgpu_cs void @_amdgpu_cs_main() { +; GFX9-LABEL: _amdgpu_cs_main: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GFX9ARCH-SDAG-LABEL: _amdgpu_cs_main: +; GFX9ARCH-SDAG: ; %bb.0: ; %.entry +; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX9ARCH-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX9ARCH-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9ARCH-SDAG-NEXT: s_endpgm +; +; GFX9ARCH-GISEL-LABEL: _amdgpu_cs_main: +; GFX9ARCH-GISEL: ; %bb.0: ; %.entry +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9ARCH-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9ARCH-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9ARCH-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: _amdgpu_cs_main: +; GFX12-SDAG: ; %bb.0: ; %.entry +; GFX12-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX12-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: _amdgpu_cs_main: +; GFX12-GISEL: ; %bb.0: ; %.entry +; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX12-GISEL-NEXT: s_endpgm +.entry: + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + %idy = call i32 @llvm.amdgcn.workgroup.id.y() + %idz = call i32 @llvm.amdgcn.workgroup.id.z() + %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0 + %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1 + %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2 + call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_cs void @caller() { +; GFX9-LABEL: caller: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[8:9] +; GFX9-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX9ARCH-SDAG-LABEL: caller: +; GFX9ARCH-SDAG: ; %bb.0: +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s10, -1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s8, s0 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s9, 0 +; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[0:1] +; GFX9ARCH-SDAG-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[8:9] +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0 +; 
GFX9ARCH-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9ARCH-SDAG-NEXT: s_endpgm +; +; GFX9ARCH-GISEL-LABEL: caller: +; GFX9ARCH-GISEL: ; %bb.0: +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s10, -1 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s8, s0 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[0:1] +; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[8:9] +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX9ARCH-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9ARCH-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: caller: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX12-SDAG-NEXT: s_mov_b32 s1, callee@abs32@hi +; GFX12-SDAG-NEXT: s_mov_b32 s0, callee@abs32@lo +; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: caller: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX12-GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo +; GFX12-GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi +; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-GISEL-NEXT: s_endpgm + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + call amdgpu_gfx void @callee(i32 %idx) + ret void +} + +declare amdgpu_gfx void @callee(i32) + +define amdgpu_gfx void @workgroup_ids_gfx(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) { +; GFX9-LABEL: workgroup_ids_gfx: +; GFX9: ; %bb.0: +; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9ARCH-LABEL: workgroup_ids_gfx: +; GFX9ARCH: ; %bb.0: +; GFX9ARCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9ARCH-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: workgroup_ids_gfx: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %id.x = call i32 @llvm.amdgcn.workgroup.id.x() + %id.y = call i32 @llvm.amdgcn.workgroup.id.y() + %id.z = call i32 @llvm.amdgcn.workgroup.id.z() + store volatile i32 %id.x, ptr addrspace(1) %outx + store volatile i32 %id.y, ptr addrspace(1) %outy + store volatile i32 %id.z, ptr addrspace(1) %outz + ret void +} + +declare i32 @llvm.amdgcn.workgroup.id.x() +declare i32 @llvm.amdgcn.workgroup.id.y() +declare i32 @llvm.amdgcn.workgroup.id.z() +declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX9-GISEL: {{.*}} +; GFX9-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll deleted file mode 100644 index 495b54758de049..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll +++ /dev/null @@ -1,128 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s - -define amdgpu_cs void @_amdgpu_cs_main() { -; GFX9-SDAG-LABEL: _amdgpu_cs_main: -; GFX9-SDAG: ; %bb.0: ; %.entry -; GFX9-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX9-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: _amdgpu_cs_main: -; GFX9-GISEL: ; %bb.0: ; %.entry -; GFX9-GISEL-NEXT: s_mov_b32 s0, ttmp9 -; GFX9-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX9-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: _amdgpu_cs_main: -; GFX12-SDAG: ; %bb.0: ; %.entry -; 
GFX12-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX12-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: _amdgpu_cs_main: -; GFX12-GISEL: ; %bb.0: ; %.entry -; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 -; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm -.entry: - %idx = call i32 @llvm.amdgcn.workgroup.id.x() - %idy = call i32 @llvm.amdgcn.workgroup.id.y() - %idz = call i32 @llvm.amdgcn.workgroup.id.z() - %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0 - %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1 - %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2 - call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0) - ret void -} - -define amdgpu_cs void @caller() { -; GFX9-SDAG-LABEL: caller: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_getpc_b64 s[8:9] -; GFX9-SDAG-NEXT: s_mov_b32 s8, s0 -; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 -; GFX9-SDAG-NEXT: s_mov_b32 s5, callee@abs32@hi -; GFX9-SDAG-NEXT: s_mov_b32 s4, callee@abs32@lo -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-SDAG-NEXT: s_mov_b32 s32, 0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_add_u32 s8, s8, s0 -; GFX9-SDAG-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[8:9] -; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], 
s[10:11] -; GFX9-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: caller: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_getpc_b64 s[8:9] -; GFX9-GISEL-NEXT: s_mov_b32 s8, s0 -; GFX9-GISEL-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 -; GFX9-GISEL-NEXT: s_mov_b32 s4, callee@abs32@lo -; GFX9-GISEL-NEXT: s_mov_b32 s5, callee@abs32@hi -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-GISEL-NEXT: s_mov_b32 s32, 0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_add_u32 s8, s8, s0 -; GFX9-GISEL-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[8:9] -; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: caller: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX12-SDAG-NEXT: s_mov_b32 s1, callee@abs32@hi -; GFX12-SDAG-NEXT: s_mov_b32 s0, callee@abs32@lo -; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 -; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: caller: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX12-GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo -; GFX12-GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi -; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 -; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX12-GISEL-NEXT: s_endpgm - %idx = call i32 @llvm.amdgcn.workgroup.id.x() - call amdgpu_gfx void @callee(i32 %idx) - ret void -} - -declare amdgpu_gfx void @callee(i32) - -declare i32 @llvm.amdgcn.workgroup.id.x() -declare i32 @llvm.amdgcn.workgroup.id.y() -declare i32 @llvm.amdgcn.workgroup.id.z() -declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg) -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX12: {{.*}} -; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index a59c0394bebe20..ca7486536cf556 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -582,5 +582,170 @@ entry: ret void } +define amdgpu_kernel void @flat_nontemporal_volatile_load( +; GFX7-LABEL: flat_nontemporal_volatile_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_nontemporal_volatile_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_nontemporal_volatile_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: 
flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_nontemporal_volatile_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_nontemporal_volatile_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_nontemporal_volatile_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_nontemporal_volatile_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_nontemporal_volatile_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_nontemporal_volatile_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm + ptr %in, ptr %out) { +entry: + %val = load volatile i32, ptr %in, align 4, !nontemporal !0 + store i32 %val, ptr %out + ret void +} + !0 = !{i32 1} declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index 07a3f85066991b..b57d539ac18f0b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -576,5 +576,163 @@ entry: ret void } +define amdgpu_kernel void @global_nontemporal_volatile_load( +; GFX6-LABEL: global_nontemporal_volatile_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: 
s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_nontemporal_volatile_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_nontemporal_volatile_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_nontemporal_volatile_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_nontemporal_volatile_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, 
s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_volatile_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_nontemporal_volatile_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_volatile_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_nontemporal_volatile_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_nontemporal_volatile_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_nontemporal_volatile_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_nontemporal_volatile_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-WGP-NEXT: s_nop 0 +; GFX12-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_nontemporal_volatile_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-CU-NEXT: s_nop 0 +; GFX12-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX12-CU-NEXT: s_endpgm + ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %val = load volatile i32, ptr addrspace(1) %in, align 4, !nontemporal !0 + store i32 %val, ptr addrspace(1) %out + ret void +} + !0 = !{i32 1} declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index 70c91d26006026..61e0ba8bd968ed 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -615,5 +615,184 @@ entry: ret void } +define amdgpu_kernel void @local_nontemporal_volatile_load( +; GFX6-LABEL: local_nontemporal_volatile_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_nontemporal_volatile_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_nontemporal_volatile_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_nontemporal_volatile_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_nontemporal_volatile_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_volatile_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_nontemporal_volatile_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_volatile_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_nontemporal_volatile_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_nontemporal_volatile_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_clause 0x1 +; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-WGP-NEXT: ds_load_b32 v0, v0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-WGP-NEXT: s_endpgm +; +; 
GFX11-CU-LABEL: local_nontemporal_volatile_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_clause 0x1 +; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-CU-NEXT: ds_load_b32 v0, v0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_nontemporal_volatile_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_clause 0x1 +; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-WGP-NEXT: ds_load_b32 v0, v0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-WGP-NEXT: s_nop 0 +; GFX12-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_nontemporal_volatile_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_clause 0x1 +; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-CU-NEXT: ds_load_b32 v0, v0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-CU-NEXT: s_nop 0 +; GFX12-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-CU-NEXT: s_endpgm + ptr addrspace(3) %in, ptr addrspace(1) %out) { +entry: + %val = load volatile i32, ptr addrspace(3) %in, align 4, !nontemporal !0 + store i32 %val, ptr addrspace(1) %out + ret void +} + !0 = !{i32 1} declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll 
b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index 069443791b3e1b..30296a9c3d8963 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -813,5 +813,208 @@ entry: ret void } +define amdgpu_kernel void @private_nontemporal_volatile_load( +; GFX6-LABEL: private_nontemporal_volatile_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_add_u32 s8, s8, s7 +; GFX6-NEXT: s_addc_u32 s9, s9, 0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_nontemporal_volatile_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_add_u32 s8, s8, s7 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_nontemporal_volatile_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7 +; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0 
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_nontemporal_volatile_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_add_u32 s8, s8, s7 +; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_nontemporal_volatile_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3] +; 
GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_nontemporal_volatile_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s4 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: private_nontemporal_volatile_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry 
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: scratch_load_dword v0, off, s4 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_nontemporal_volatile_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_clause 0x1 +; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s2 glc dlc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_nontemporal_volatile_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_clause 0x1 +; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s2 glc dlc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_nontemporal_volatile_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_clause 0x1 +; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-WGP-NEXT: s_nop 0 +; 
GFX12-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_nontemporal_volatile_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_clause 0x1 +; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-CU-NEXT: s_nop 0 +; GFX12-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-CU-NEXT: s_endpgm + ptr addrspace(5) %in, ptr addrspace(1) %out) { +entry: + %val = load volatile i32, ptr addrspace(5) %in, align 4, !nontemporal !0 + store i32 %val, ptr addrspace(1) %out + ret void +} + !0 = !{i32 1} declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll index 15af1f17e230ec..f1e2737b370ef0 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll @@ -84,4 +84,58 @@ entry: ret void } +define amdgpu_kernel void @memset_array_ptr_alloca(ptr %out) { +; CHECK-LABEL: @memset_array_ptr_alloca( +; CHECK-NEXT: store i64 0, ptr [[OUT:%.*]], align 8 +; CHECK-NEXT: ret void +; + %alloca = alloca [6 x ptr], align 16, addrspace(5) + call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false) + %load = load i64, ptr addrspace(5) %alloca + store i64 %load, ptr %out + ret void +} + +define amdgpu_kernel void @memset_vector_ptr_alloca(ptr %out) { +; CHECK-LABEL: @memset_vector_ptr_alloca( +; CHECK-NEXT: store i64 0, ptr [[OUT:%.*]], align 8 +; CHECK-NEXT: ret void +; + %alloca = alloca <6 x ptr>, align 16, addrspace(5) + call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false) + %load = load i64, ptr addrspace(5) %alloca + store i64 %load, ptr %out + 
ret void +} + +define amdgpu_kernel void @memset_array_of_array_ptr_alloca(ptr %out) { +; CHECK-LABEL: @memset_array_of_array_ptr_alloca( +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x [3 x ptr]], align 16, addrspace(5) +; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[ALLOCA]], i8 0, i64 48, i1 false) +; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(5) [[ALLOCA]], align 8 +; CHECK-NEXT: store i64 [[LOAD]], ptr [[OUT:%.*]], align 8 +; CHECK-NEXT: ret void +; + %alloca = alloca [2 x [3 x ptr]], align 16, addrspace(5) + call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false) + %load = load i64, ptr addrspace(5) %alloca + store i64 %load, ptr %out + ret void +} + +define amdgpu_kernel void @memset_array_of_vec_ptr_alloca(ptr %out) { +; CHECK-LABEL: @memset_array_of_vec_ptr_alloca( +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x <3 x ptr>], align 16, addrspace(5) +; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[ALLOCA]], i8 0, i64 48, i1 false) +; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(5) [[ALLOCA]], align 8 +; CHECK-NEXT: store i64 [[LOAD]], ptr [[OUT:%.*]], align 8 +; CHECK-NEXT: ret void +; + %alloca = alloca [2 x <3 x ptr>], align 16, addrspace(5) + call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false) + %load = load i64, ptr addrspace(5) %alloca + store i64 %load, ptr %out + ret void +} + declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll new file mode 100644 index 00000000000000..fb2af36839b5d2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll @@ -0,0 +1,499 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define 
amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: 
global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void 
@test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = 
call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x half> %C + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; 
GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + 
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], 
off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, 
<8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <16 x half> %B + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 
v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <16 x half> %B + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +; both neg and abs patterns (wmma matrix C f32 or f16 ) + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %fneg.fabs.C = fneg <8 x float> %fabs.C + %res = call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) + %fneg.fabs.C = fneg <8 x half> %fabs.C + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %el3 = extractelement <8 x float> %C, i32 3 + %el3.fabs = call float @llvm.fabs.f32(float %el3) + %partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3 + %fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C) + 
store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +; A or B matrix modifier and constant in C + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> , i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +; pack f16 elements with v_perm_b32 since they don't come from same b32 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: flat_load_b128 v[12:15], 
v[8:9] offset:16 +; GFX12-NEXT: flat_load_b128 v[16:19], v[8:9] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x101 +; GFX12-NEXT: v_perm_b32 v15, v15, v14, 0x5040100 +; GFX12-NEXT: v_perm_b32 v14, v13, v12, 0x5040100 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_perm_b32 v13, v19, v18, 0x5040100 +; GFX12-NEXT: v_perm_b32 v12, v17, v16, 0x5040100 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[10:11], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %C = load <16 x half>, ptr %Caddr + %C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> + %fneg.C_shuffle = fneg <8 x half> %C_shuffle + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +declare <8 x half> @llvm.fabs.v8f16(<8 x half>) +declare <8 x float> @llvm.fabs.v8f32(<8 x float>) +declare float @llvm.fabs.f32(float) + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half>, <16 
x half>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll new file mode 100644 index 00000000000000..51f93b57f38e90 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll @@ -0,0 +1,431 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10 +; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10 +; GFX12-NEXT: v_mov_b32_e32 v17, v10 +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 
v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10 +; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10 +; GFX12-NEXT: v_mov_b32_e32 v17, v10 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = 
call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v10, 0x42004200 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GFX12-NEXT: v_mov_b32_e32 v13, v10 +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: 
s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GFX12-NEXT: v_mov_b32_e32 v13, v10 +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; 
GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: 
v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v10, v4 +; GFX12-NEXT: v_mov_b32_e32 v11, v4 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: 
v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], 
v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) 
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll new file mode 100644 index 00000000000000..48297932aa5984 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll @@ -0,0 +1,309 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) 
%out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 
%B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: 
v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x 
i32> %B, <8 x i32> %C, i16 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], 
off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll new file mode 100644 index 00000000000000..43538768f00fd9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll @@ -0,0 +1,321 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v20, v[20:21], off +; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void 
@test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v20, v[20:21], off +; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; 
GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v16, v[16:17], off +; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index0) + store <8 x half> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index1) + store <8 x half> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v16, v[16:17], off +; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 
index_key:1 +; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index0) + store <8 x i16> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index1) + store <8 x i16> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0) + store <8 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0) + store <8 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v11, v[11:12], off +; GFX12-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9 +; GFX12-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7 +; GFX12-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5 +; GFX12-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16 +; GFX12-NEXT: global_store_b128 v[13:14], v[17:20], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + 
%res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0) + store <8 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0) + store <8 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x 
float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> 
@llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr 
addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) 
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll new file mode 100644 index 00000000000000..2db3b07de54d0a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll @@ -0,0 +1,370 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off 
offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] +; GFX12-NEXT: global_store_b128 
v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; 
GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr 
addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, 
<8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> 
@llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 
v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, 
<4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg) +declare <8 x i32> 
@llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) 
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll new file mode 100644 index 00000000000000..1f2ffaf7627125 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll @@ -0,0 +1,456 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 
0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] 
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb 
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x half> %C + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { 
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* 
%out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x 
float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void 
+} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <4 x 
half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +; both neg and abs patterns (wmma matrix C f32 or f16 ) + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %fneg.fabs.C = fneg <4 x float> %fabs.C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) + %fneg.fabs.C = fneg <4 x half> %fabs.C + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: 
test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %el3 = extractelement <4 x float> %C, i32 3 + %el3.fabs = call float @llvm.fabs.f32(float %el3) + %partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3 + %fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +; A or B matrix modifier and constant in C + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: 
s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> , i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +; pack f16 elements with v_perm_b32 since they don't come from same b32 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 +; GFX12-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %C = load <8 x half>, ptr %Caddr + %C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> + %fneg.C_shuffle = fneg <4 x half> %C_shuffle + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle , i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +declare <4 x half> @llvm.fabs.v4f16(<4 x half>) +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) +declare float @llvm.fabs.f32(float) + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32, i32, <4 x float>) 
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half>, <8 x half>, <4 x float>, i16) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half>, <8 x half>, <4 x half>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll new file mode 100644 index 00000000000000..fa0a7c98cea323 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll @@ -0,0 +1,373 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, v6 +; GFX12-NEXT: 
v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v9, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, v6 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v9, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x42004200 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, v6 +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define 
amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x3fc03fc0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, v6 +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr 
addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x 
i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll new file mode 100644 index 00000000000000..803453e45ced05 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll @@ -0,0 +1,274 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GFX12: ; %bb.0: 
; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; 
GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: 
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 
x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll new file mode 100644 index 00000000000000..0c5b19a8416fb8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll @@ -0,0 +1,472 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v10, v[10:11], off +; GFX12-NEXT: v_mov_b32_e32 v23, v9 +; GFX12-NEXT: v_mov_b32_e32 v22, v8 +; 
GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v9 +; GFX12-NEXT: v_mov_b32_e32 v26, v8 +; GFX12-NEXT: v_mov_b32_e32 v25, v7 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v31, v9 +; GFX12-NEXT: v_mov_b32_e32 v30, v8 +; GFX12-NEXT: v_mov_b32_e32 v29, v7 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> 
@llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v10, v[10:11], off +; GFX12-NEXT: v_mov_b32_e32 v23, v9 +; GFX12-NEXT: v_mov_b32_e32 v22, v8 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v9 +; GFX12-NEXT: v_mov_b32_e32 v26, v8 +; GFX12-NEXT: v_mov_b32_e32 v25, v7 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v31, v9 +; GFX12-NEXT: v_mov_b32_e32 v30, v8 +; GFX12-NEXT: v_mov_b32_e32 v29, v7 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 
%Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v22, v[8:9], off +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v7 +; GFX12-NEXT: v_mov_b32_e32 v18, v6 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX12-NEXT: 
global_store_b64 v[16:17], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index0) + store <4 x half> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index1) + store <4 x half> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index2) + store <4 x half> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index3) + store <4 x half> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v22, v[8:9], off +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v7 +; GFX12-NEXT: v_mov_b32_e32 v18, v6 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1 
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index0) + store <4 x i16> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index1) + store <4 x i16> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index2) + store <4 x i16> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index3) + store <4 x i16> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 
+; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index2, i1 0) + store <4 x i32> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + 
%res3 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index3, i1 0) + store <4 x i32> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v6, v[6:7], off +; GFX12-NEXT: v_mov_b32_e32 v15, v5 +; GFX12-NEXT: v_mov_b32_e32 v14, v4 +; GFX12-NEXT: v_mov_b32_e32 v13, v3 +; GFX12-NEXT: v_mov_b32_e32 v12, v2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 +; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: 
v_mov_b32_e32 v16, v6 +; GFX12-NEXT: v_mov_b32_e32 v15, v5 +; GFX12-NEXT: v_mov_b32_e32 v14, v4 +; GFX12-NEXT: v_mov_b32_e32 v13, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off +; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr 
addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> 
@llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr 
addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 
v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x 
half>, <4 x half>, i8) +declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll new file mode 100644 index 00000000000000..b25f2785133310 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll @@ -0,0 +1,333 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x 
float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0) + store <4 x i16> %res, ptr addrspace(1) %out 
+ ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 
v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> 
@llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> 
%B, <4 x half> %C, i8 %Index) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr 
addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x 
float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8) +declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir new 
file mode 100644 index 00000000000000..33c32d7290da41 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir @@ -0,0 +1,354 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s + +# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0. +# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0 +# $D1 = wmma1 $A1, $B1, $C1 or $D1 = swmmac1 $A1, $B1, $C1, $Index1 + +--- +name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = 
V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec +... + +--- +name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = 
V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec +... + +--- +name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... 
+--- +name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... + +--- +name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = 
V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + 
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed 
$vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... + +--- +name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = 
V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr0, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + + ; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = 
V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + + ; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed 
$vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec + 
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 = V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed 
$vgpr28_vgpr29, killed $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, killed $vgpr0, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 = V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr28_vgpr29, killed $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, killed $vgpr0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; 
GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed 
$vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed 
$vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec + early-clobber 
renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec +... diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir new file mode 100644 index 00000000000000..ab89feb861b5e3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir @@ -0,0 +1,355 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s + +# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0. 
+# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0 +# $D1 = wmma1 $A1, $B1, $C1 or $D1 = swmmac1 $A1, $B1, $C1, $Index1 + +--- +name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23_vgpr24_vgpr25 = V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23_vgpr24_vgpr25, killed $vgpr0, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr22_vgpr23_vgpr24_vgpr25 = V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23_vgpr24_vgpr25, killed $vgpr0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + + ; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + + ; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr16, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr16, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec +... 
diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll index 769e6b0964abdb..40e4692a18ec79 100644 --- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll @@ -5,43 +5,25 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { -; GFX9-SDAG-LABEL: workgroup_id_x: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm ; -; GFX9-GISEL-LABEL: workgroup_id_x: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: workgroup_id_x: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm +; GFX9-LABEL: workgroup_id_x: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: workgroup_id_x: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, ttmp9 :: 
v_dual_mov_b32 v1, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm +; GFX12-LABEL: workgroup_id_x: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() store i32 %idx, ptr addrspace(1) %ptrx @@ -52,23 +34,23 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace ; GFX9-LABEL: workgroup_id_xy: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, ttmp9 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, ttmp7 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: workgroup_id_xy: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 -; GFX12-NEXT: v_mov_b32_e32 v2, ttmp7 +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, ttmp7 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: global_store_b32 v0, v2, s[2:3] +; GFX12-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-NEXT: global_store_b32 v2, v1, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -81,37 +63,21 @@ define 
amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace } define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspace(1) %ptry, ptr addrspace(1) %ptrz) { -; GFX9-SDAG-LABEL: workgroup_id_xyz: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[2:3] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[6:7] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: workgroup_id_xyz: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-GISEL-NEXT: s_and_b32 s0, ttmp7, 0xffff -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: s_lshr_b32 s0, ttmp7, 16 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[6:7] -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: workgroup_id_xyz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_and_b32 s6, ttmp7, 0xffff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: workgroup_id_xyz: ; GFX12: ; %bb.0: @@ -119,15 +85,15 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX12-NEXT: s_and_b32 s2, ttmp7, 0xffff -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: s_lshr_b32 s3, ttmp7, 16 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX12-NEXT: global_store_b32 v0, v2, s[6:7] -; GFX12-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: global_store_b32 v1, v2, s[6:7] +; GFX12-NEXT: global_store_b32 v1, v3, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -144,3 +110,8 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac declare i32 @llvm.amdgcn.workgroup.id.x() declare i32 @llvm.amdgcn.workgroup.id.y() declare i32 @llvm.amdgcn.workgroup.id.z() +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX12-GISEL: {{.*}} +; GFX12-SDAG: {{.*}} +; GFX9-GISEL: {{.*}} +; GFX9-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/ARM/ldst-opt-lr-restored.ll b/llvm/test/CodeGen/ARM/ldst-opt-lr-restored.ll new file mode 100644 index 00000000000000..9494880f990258 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ldst-opt-lr-restored.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple thumbv7a-none-eabi < %s | FileCheck %s + +@val0 = global i32 0, align 4 +@val1 = global i32 0, align 4 +@val2 = global i32 0, align 4 + +define i32 @foo(ptr %ctx) { +; CHECK-LABEL: foo: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cbz r0, .LBB0_2 +; CHECK-NEXT: @ %bb.1: @ %if.end +; CHECK-NEXT: movw r12, :lower16:val2 +; CHECK-NEXT: movw r3, :lower16:val1 +; CHECK-NEXT: movw r2, :lower16:val0 +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: movt r12, :upper16:val2 +; CHECK-NEXT: movt r3, :upper16:val1 +; CHECK-NEXT: movt r2, :upper16:val0 +; CHECK-NEXT: str r2, [r1, #4] +; CHECK-NEXT: str r3, [r1, #8] +; CHECK-NEXT: str.w r12, [r1, #12] +; CHECK-NEXT: str r0, [r1, #16] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .LBB0_2: @ %if.then +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: bl bar +; CHECK-NEXT: mov.w r0, #-1 +; CHECK-NEXT: pop {r7, pc} +entry: + %tobool.not = icmp eq ptr %ctx, null + br i1 %tobool.not, label %if.then, label %if.end + +if.then: ; preds = %entry + tail call void @bar() #2 + br label %return + +if.end: ; preds = %entry + %cmd_a = getelementptr inbounds i8, ptr %ctx, i32 4 + store ptr @val0, ptr %cmd_a, align 4 + %cmd_b = getelementptr inbounds i8, ptr %ctx, i32 8 + store ptr @val1, ptr %cmd_b, align 4 + %cmd_c = getelementptr inbounds i8, ptr %ctx, i32 12 + store ptr @val2, ptr %cmd_c, align 4 + %cmd_d = getelementptr inbounds i8, ptr %ctx, i32 16 + store ptr null, ptr %cmd_d, align 4 + br label %return + +return: ; preds = 
%if.end, %if.then + %retval.0 = phi i32 [ 0, %if.end ], [ -1, %if.then ] + ret i32 %retval.0 +} + +declare void @bar() diff --git a/llvm/test/CodeGen/AVR/bug-81911.ll b/llvm/test/CodeGen/AVR/bug-81911.ll new file mode 100644 index 00000000000000..2a22666a1ff927 --- /dev/null +++ b/llvm/test/CodeGen/AVR/bug-81911.ll @@ -0,0 +1,163 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=avr -mcpu=atmega328 -O1 -verify-machineinstrs | FileCheck %s + +define internal i8 @main() { +; CHECK-LABEL: main: +; CHECK: ; %bb.0: ; %bb0 +; CHECK-NEXT: push r2 +; CHECK-NEXT: push r3 +; CHECK-NEXT: push r4 +; CHECK-NEXT: push r5 +; CHECK-NEXT: push r6 +; CHECK-NEXT: push r7 +; CHECK-NEXT: push r8 +; CHECK-NEXT: push r9 +; CHECK-NEXT: push r10 +; CHECK-NEXT: push r11 +; CHECK-NEXT: push r12 +; CHECK-NEXT: push r13 +; CHECK-NEXT: push r14 +; CHECK-NEXT: push r15 +; CHECK-NEXT: push r16 +; CHECK-NEXT: push r17 +; CHECK-NEXT: push r28 +; CHECK-NEXT: push r29 +; CHECK-NEXT: in r28, 61 +; CHECK-NEXT: in r29, 62 +; CHECK-NEXT: sbiw r28, 13 +; CHECK-NEXT: in r0, 63 +; CHECK-NEXT: cli +; CHECK-NEXT: out 62, r29 +; CHECK-NEXT: out 63, r0 +; CHECK-NEXT: out 61, r28 +; CHECK-NEXT: ldi r16, 0 +; CHECK-NEXT: ldi r17, 0 +; CHECK-NEXT: ldi r18, -1 +; CHECK-NEXT: ;APP +; CHECK-NEXT: ldi r24, 123 +; CHECK-NEXT: ;NO_APP +; CHECK-NEXT: std Y+1, r24 ; 1-byte Folded Spill +; CHECK-NEXT: movw r24, r28 +; CHECK-NEXT: adiw r24, 6 +; CHECK-NEXT: std Y+3, r25 ; 2-byte Folded Spill +; CHECK-NEXT: std Y+2, r24 ; 2-byte Folded Spill +; CHECK-NEXT: movw r8, r16 +; CHECK-NEXT: movw r6, r16 +; CHECK-NEXT: movw r4, r16 +; CHECK-NEXT: movw r2, r16 +; CHECK-NEXT: rjmp .LBB0_2 +; CHECK-NEXT: .LBB0_1: ; %bb1 +; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: andi r30, 1 +; CHECK-NEXT: ldd r31, Y+4 ; 1-byte Folded Reload +; CHECK-NEXT: dec r31 +; CHECK-NEXT: cpi r30, 0 +; CHECK-NEXT: movw r8, r18 +; CHECK-NEXT: movw r6, r20 +; 
CHECK-NEXT: movw r4, r22 +; CHECK-NEXT: movw r2, r24 +; CHECK-NEXT: mov r18, r31 +; CHECK-NEXT: brne .LBB0_2 +; CHECK-NEXT: rjmp .LBB0_4 +; CHECK-NEXT: .LBB0_2: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: std Y+4, r18 ; 1-byte Folded Spill +; CHECK-NEXT: movw r18, r8 +; CHECK-NEXT: movw r20, r6 +; CHECK-NEXT: movw r22, r4 +; CHECK-NEXT: movw r24, r2 +; CHECK-NEXT: ldi r26, 10 +; CHECK-NEXT: ldi r27, 0 +; CHECK-NEXT: movw r10, r26 +; CHECK-NEXT: movw r12, r16 +; CHECK-NEXT: movw r14, r16 +; CHECK-NEXT: call __udivdi3 +; CHECK-NEXT: std Y+13, r25 +; CHECK-NEXT: std Y+12, r24 +; CHECK-NEXT: std Y+11, r23 +; CHECK-NEXT: std Y+10, r22 +; CHECK-NEXT: std Y+9, r21 +; CHECK-NEXT: std Y+8, r20 +; CHECK-NEXT: std Y+7, r19 +; CHECK-NEXT: std Y+6, r18 +; CHECK-NEXT: ldd r30, Y+2 ; 2-byte Folded Reload +; CHECK-NEXT: ldd r31, Y+3 ; 2-byte Folded Reload +; CHECK-NEXT: ;APP +; CHECK-NEXT: ;NO_APP +; CHECK-NEXT: ldi r30, 1 +; CHECK-NEXT: cp r8, r1 +; CHECK-NEXT: cpc r9, r1 +; CHECK-NEXT: cpc r6, r16 +; CHECK-NEXT: cpc r7, r17 +; CHECK-NEXT: cpc r4, r16 +; CHECK-NEXT: cpc r5, r17 +; CHECK-NEXT: cpc r2, r16 +; CHECK-NEXT: cpc r3, r17 +; CHECK-NEXT: breq .LBB0_3 +; CHECK-NEXT: rjmp .LBB0_1 +; CHECK-NEXT: .LBB0_3: ; %bb1 +; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: mov r30, r1 +; CHECK-NEXT: rjmp .LBB0_1 +; CHECK-NEXT: .LBB0_4: ; %bb3 +; CHECK-NEXT: ldd r24, Y+1 ; 1-byte Folded Reload +; CHECK-NEXT: std Y+5, r24 +; CHECK-NEXT: movw r24, r28 +; CHECK-NEXT: adiw r24, 5 +; CHECK-NEXT: ;APP +; CHECK-NEXT: ;NO_APP +; CHECK-NEXT: ldd r24, Y+5 +; CHECK-NEXT: adiw r28, 13 +; CHECK-NEXT: in r0, 63 +; CHECK-NEXT: cli +; CHECK-NEXT: out 62, r29 +; CHECK-NEXT: out 63, r0 +; CHECK-NEXT: out 61, r28 +; CHECK-NEXT: pop r29 +; CHECK-NEXT: pop r28 +; CHECK-NEXT: pop r17 +; CHECK-NEXT: pop r16 +; CHECK-NEXT: pop r15 +; CHECK-NEXT: pop r14 +; CHECK-NEXT: pop r13 +; CHECK-NEXT: pop r12 +; CHECK-NEXT: pop r11 +; CHECK-NEXT: pop r10 +; CHECK-NEXT: pop r9 +; 
CHECK-NEXT: pop r8 +; CHECK-NEXT: pop r7 +; CHECK-NEXT: pop r6 +; CHECK-NEXT: pop r5 +; CHECK-NEXT: pop r4 +; CHECK-NEXT: pop r3 +; CHECK-NEXT: pop r2 +; CHECK-NEXT: ret +bb0: + %0 = alloca i64 + %1 = alloca i8 + %2 = tail call i8 asm sideeffect "ldi ${0}, 123", "=&r,~{sreg},~{memory}"() + + br label %bb1 + +bb1: + %3 = phi i64 [ %5, %bb1 ], [ 0, %bb0 ] + %4 = phi i8 [ %6, %bb1 ], [ 0, %bb0 ] + %5 = udiv i64 %3, 10 + %6 = add i8 %4, 1 + + store i64 %5, ptr %0 + call void asm sideeffect "", "r,~{memory}"(ptr %0) + + %7 = icmp eq i64 %3, 0 + %8 = icmp eq i8 %6, 0 + + br i1 %7, label %bb3, label %bb1 + +bb3: + store i8 %2, ptr %1 + call void asm sideeffect "", "r,~{memory}"(ptr %1) + + %9 = load i8, ptr %1 + + ret i8 %9 +} diff --git a/llvm/test/CodeGen/LoongArch/addrspacecast.ll b/llvm/test/CodeGen/LoongArch/addrspacecast.ll new file mode 100644 index 00000000000000..7875562331be09 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/addrspacecast.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --verify-machineinstrs < %s | FileCheck %s --check-prefix=LA32 +; RUN: llc --mtriple=loongarch64 --verify-machineinstrs < %s | FileCheck %s --check-prefix=LA64 + +define void @cast0(ptr addrspace(1) %ptr) { +; LA32-LABEL: cast0: +; LA32: # %bb.0: +; LA32-NEXT: st.w $zero, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: cast0: +; LA64: # %bb.0: +; LA64-NEXT: st.w $zero, $a0, 0 +; LA64-NEXT: ret + %ptr0 = addrspacecast ptr addrspace(1) %ptr to ptr addrspace(0) + store i32 0, ptr %ptr0 + ret void +} + +define void @cast1(ptr %ptr) { +; LA32-LABEL: cast1: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: .cfi_def_cfa_offset 16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: .cfi_offset 1, -4 +; LA32-NEXT: bl %plt(foo) +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: cast1: +; LA64: # %bb.0: +; 
LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: .cfi_def_cfa_offset 16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: .cfi_offset 1, -8 +; LA64-NEXT: bl %plt(foo) +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret + %castptr = addrspacecast ptr %ptr to ptr addrspace(10) + call void @foo(ptr addrspace(10) %castptr) + ret void +} + +declare void @foo(ptr addrspace(10)) diff --git a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll index b0f29ee790885d..b84c1093eb75f2 100644 --- a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll +++ b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll @@ -25,15 +25,16 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) { ; LA64-NEXT: andi $a5, $a5, 255 ; LA64-NEXT: sll.w $a5, $a5, $a3 ; LA64-NEXT: and $a6, $a2, $a4 -; LA64-NEXT: or $a6, $a6, $a5 +; LA64-NEXT: or $a5, $a6, $a5 +; LA64-NEXT: addi.w $a6, $a2, 0 ; LA64-NEXT: .LBB0_3: # %atomicrmw.start ; LA64-NEXT: # Parent Loop BB0_1 Depth=1 ; LA64-NEXT: # => This Inner Loop Header: Depth=2 -; LA64-NEXT: ll.w $a5, $a0, 0 -; LA64-NEXT: bne $a5, $a2, .LBB0_5 +; LA64-NEXT: ll.w $a2, $a0, 0 +; LA64-NEXT: bne $a2, $a6, .LBB0_5 ; LA64-NEXT: # %bb.4: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB0_3 Depth=2 -; LA64-NEXT: move $a7, $a6 +; LA64-NEXT: move $a7, $a5 ; LA64-NEXT: sc.w $a7, $a0, 0 ; LA64-NEXT: beqz $a7, .LBB0_3 ; LA64-NEXT: b .LBB0_6 @@ -42,11 +43,9 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) { ; LA64-NEXT: dbar 20 ; LA64-NEXT: .LBB0_6: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB0_1 Depth=1 -; LA64-NEXT: addi.w $a6, $a2, 0 -; LA64-NEXT: move $a2, $a5 -; LA64-NEXT: bne $a5, $a6, .LBB0_1 +; LA64-NEXT: bne $a2, $a6, .LBB0_1 ; LA64-NEXT: # %bb.2: # %atomicrmw.end -; LA64-NEXT: srl.w $a0, $a5, $a3 +; LA64-NEXT: srl.w $a0, $a2, $a3 ; LA64-NEXT: ret %result = atomicrmw uinc_wrap ptr %ptr, i8 %val seq_cst ret i8 
%result @@ -77,15 +76,16 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { ; LA64-NEXT: bstrpick.d $a5, $a5, 15, 0 ; LA64-NEXT: sll.w $a5, $a5, $a3 ; LA64-NEXT: and $a6, $a2, $a4 -; LA64-NEXT: or $a6, $a6, $a5 +; LA64-NEXT: or $a5, $a6, $a5 +; LA64-NEXT: addi.w $a6, $a2, 0 ; LA64-NEXT: .LBB1_3: # %atomicrmw.start ; LA64-NEXT: # Parent Loop BB1_1 Depth=1 ; LA64-NEXT: # => This Inner Loop Header: Depth=2 -; LA64-NEXT: ll.w $a5, $a0, 0 -; LA64-NEXT: bne $a5, $a2, .LBB1_5 +; LA64-NEXT: ll.w $a2, $a0, 0 +; LA64-NEXT: bne $a2, $a6, .LBB1_5 ; LA64-NEXT: # %bb.4: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB1_3 Depth=2 -; LA64-NEXT: move $a7, $a6 +; LA64-NEXT: move $a7, $a5 ; LA64-NEXT: sc.w $a7, $a0, 0 ; LA64-NEXT: beqz $a7, .LBB1_3 ; LA64-NEXT: b .LBB1_6 @@ -94,11 +94,9 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { ; LA64-NEXT: dbar 20 ; LA64-NEXT: .LBB1_6: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB1_1 Depth=1 -; LA64-NEXT: addi.w $a6, $a2, 0 -; LA64-NEXT: move $a2, $a5 -; LA64-NEXT: bne $a5, $a6, .LBB1_1 +; LA64-NEXT: bne $a2, $a6, .LBB1_1 ; LA64-NEXT: # %bb.2: # %atomicrmw.end -; LA64-NEXT: srl.w $a0, $a5, $a3 +; LA64-NEXT: srl.w $a0, $a2, $a3 ; LA64-NEXT: ret %result = atomicrmw uinc_wrap ptr %ptr, i16 %val seq_cst ret i16 %result @@ -107,37 +105,36 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) { ; LA64-LABEL: atomicrmw_uinc_wrap_i32: ; LA64: # %bb.0: -; LA64-NEXT: ld.w $a3, $a0, 0 -; LA64-NEXT: addi.w $a2, $a1, 0 +; LA64-NEXT: ld.w $a2, $a0, 0 +; LA64-NEXT: addi.w $a1, $a1, 0 ; LA64-NEXT: .p2align 4, , 16 ; LA64-NEXT: .LBB2_1: # %atomicrmw.start ; LA64-NEXT: # =>This Loop Header: Depth=1 ; LA64-NEXT: # Child Loop BB2_3 Depth 2 -; LA64-NEXT: addi.w $a4, $a3, 0 -; LA64-NEXT: sltu $a1, $a4, $a2 -; LA64-NEXT: xori $a1, $a1, 1 -; LA64-NEXT: addi.d $a5, $a3, 1 -; LA64-NEXT: masknez $a5, $a5, $a1 +; LA64-NEXT: addi.w $a3, $a2, 0 +; LA64-NEXT: sltu $a4, $a3, $a1 
+; LA64-NEXT: xori $a4, $a4, 1 +; LA64-NEXT: addi.d $a2, $a2, 1 +; LA64-NEXT: masknez $a4, $a2, $a4 ; LA64-NEXT: .LBB2_3: # %atomicrmw.start ; LA64-NEXT: # Parent Loop BB2_1 Depth=1 ; LA64-NEXT: # => This Inner Loop Header: Depth=2 -; LA64-NEXT: ll.w $a1, $a0, 0 -; LA64-NEXT: bne $a1, $a3, .LBB2_5 +; LA64-NEXT: ll.w $a2, $a0, 0 +; LA64-NEXT: bne $a2, $a3, .LBB2_5 ; LA64-NEXT: # %bb.4: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB2_3 Depth=2 -; LA64-NEXT: move $a6, $a5 -; LA64-NEXT: sc.w $a6, $a0, 0 -; LA64-NEXT: beqz $a6, .LBB2_3 +; LA64-NEXT: move $a5, $a4 +; LA64-NEXT: sc.w $a5, $a0, 0 +; LA64-NEXT: beqz $a5, .LBB2_3 ; LA64-NEXT: b .LBB2_6 ; LA64-NEXT: .LBB2_5: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB2_1 Depth=1 ; LA64-NEXT: dbar 20 ; LA64-NEXT: .LBB2_6: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB2_1 Depth=1 -; LA64-NEXT: move $a3, $a1 -; LA64-NEXT: bne $a1, $a4, .LBB2_1 +; LA64-NEXT: bne $a2, $a3, .LBB2_1 ; LA64-NEXT: # %bb.2: # %atomicrmw.end -; LA64-NEXT: move $a0, $a1 +; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret %result = atomicrmw uinc_wrap ptr %ptr, i32 %val seq_cst ret i32 %result @@ -209,15 +206,16 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) { ; LA64-NEXT: andi $a6, $a6, 255 ; LA64-NEXT: sll.w $a6, $a6, $a3 ; LA64-NEXT: and $a7, $a2, $a4 -; LA64-NEXT: or $a7, $a7, $a6 +; LA64-NEXT: or $a6, $a7, $a6 +; LA64-NEXT: addi.w $a7, $a2, 0 ; LA64-NEXT: .LBB4_3: # %atomicrmw.start ; LA64-NEXT: # Parent Loop BB4_1 Depth=1 ; LA64-NEXT: # => This Inner Loop Header: Depth=2 -; LA64-NEXT: ll.w $a6, $a0, 0 -; LA64-NEXT: bne $a6, $a2, .LBB4_5 +; LA64-NEXT: ll.w $a2, $a0, 0 +; LA64-NEXT: bne $a2, $a7, .LBB4_5 ; LA64-NEXT: # %bb.4: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB4_3 Depth=2 -; LA64-NEXT: move $t0, $a7 +; LA64-NEXT: move $t0, $a6 ; LA64-NEXT: sc.w $t0, $a0, 0 ; LA64-NEXT: beqz $t0, .LBB4_3 ; LA64-NEXT: b .LBB4_6 @@ -226,11 +224,9 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) { ; LA64-NEXT: dbar 20 ; 
LA64-NEXT: .LBB4_6: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB4_1 Depth=1 -; LA64-NEXT: addi.w $a7, $a2, 0 -; LA64-NEXT: move $a2, $a6 -; LA64-NEXT: bne $a6, $a7, .LBB4_1 +; LA64-NEXT: bne $a2, $a7, .LBB4_1 ; LA64-NEXT: # %bb.2: # %atomicrmw.end -; LA64-NEXT: srl.w $a0, $a6, $a3 +; LA64-NEXT: srl.w $a0, $a2, $a3 ; LA64-NEXT: ret %result = atomicrmw udec_wrap ptr %ptr, i8 %val seq_cst ret i8 %result @@ -266,15 +262,16 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { ; LA64-NEXT: bstrpick.d $a6, $a6, 15, 0 ; LA64-NEXT: sll.w $a6, $a6, $a3 ; LA64-NEXT: and $a7, $a2, $a4 -; LA64-NEXT: or $a7, $a7, $a6 +; LA64-NEXT: or $a6, $a7, $a6 +; LA64-NEXT: addi.w $a7, $a2, 0 ; LA64-NEXT: .LBB5_3: # %atomicrmw.start ; LA64-NEXT: # Parent Loop BB5_1 Depth=1 ; LA64-NEXT: # => This Inner Loop Header: Depth=2 -; LA64-NEXT: ll.w $a6, $a0, 0 -; LA64-NEXT: bne $a6, $a2, .LBB5_5 +; LA64-NEXT: ll.w $a2, $a0, 0 +; LA64-NEXT: bne $a2, $a7, .LBB5_5 ; LA64-NEXT: # %bb.4: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB5_3 Depth=2 -; LA64-NEXT: move $t0, $a7 +; LA64-NEXT: move $t0, $a6 ; LA64-NEXT: sc.w $t0, $a0, 0 ; LA64-NEXT: beqz $t0, .LBB5_3 ; LA64-NEXT: b .LBB5_6 @@ -283,11 +280,9 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { ; LA64-NEXT: dbar 20 ; LA64-NEXT: .LBB5_6: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB5_1 Depth=1 -; LA64-NEXT: addi.w $a7, $a2, 0 -; LA64-NEXT: move $a2, $a6 -; LA64-NEXT: bne $a6, $a7, .LBB5_1 +; LA64-NEXT: bne $a2, $a7, .LBB5_1 ; LA64-NEXT: # %bb.2: # %atomicrmw.end -; LA64-NEXT: srl.w $a0, $a6, $a3 +; LA64-NEXT: srl.w $a0, $a2, $a3 ; LA64-NEXT: ret %result = atomicrmw udec_wrap ptr %ptr, i16 %val seq_cst ret i16 %result @@ -296,22 +291,22 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) { ; LA64-LABEL: atomicrmw_udec_wrap_i32: ; LA64: # %bb.0: -; LA64-NEXT: ld.w $a4, $a0, 0 +; LA64-NEXT: ld.w $a2, $a0, 0 ; LA64-NEXT: addi.w $a3, $a1, 0 ; LA64-NEXT: 
.p2align 4, , 16 ; LA64-NEXT: .LBB6_1: # %atomicrmw.start ; LA64-NEXT: # =>This Loop Header: Depth=1 ; LA64-NEXT: # Child Loop BB6_3 Depth 2 -; LA64-NEXT: addi.w $a5, $a4, 0 -; LA64-NEXT: sltu $a2, $a3, $a5 -; LA64-NEXT: addi.d $a6, $a4, -1 -; LA64-NEXT: masknez $a6, $a6, $a2 -; LA64-NEXT: maskeqz $a2, $a1, $a2 -; LA64-NEXT: or $a2, $a2, $a6 -; LA64-NEXT: sltui $a6, $a5, 1 -; LA64-NEXT: masknez $a2, $a2, $a6 -; LA64-NEXT: maskeqz $a6, $a1, $a6 -; LA64-NEXT: or $a6, $a6, $a2 +; LA64-NEXT: addi.w $a4, $a2, 0 +; LA64-NEXT: sltu $a5, $a3, $a4 +; LA64-NEXT: addi.d $a2, $a2, -1 +; LA64-NEXT: masknez $a2, $a2, $a5 +; LA64-NEXT: maskeqz $a5, $a1, $a5 +; LA64-NEXT: or $a2, $a5, $a2 +; LA64-NEXT: sltui $a5, $a4, 1 +; LA64-NEXT: masknez $a2, $a2, $a5 +; LA64-NEXT: maskeqz $a5, $a1, $a5 +; LA64-NEXT: or $a5, $a5, $a2 ; LA64-NEXT: .LBB6_3: # %atomicrmw.start ; LA64-NEXT: # Parent Loop BB6_1 Depth=1 ; LA64-NEXT: # => This Inner Loop Header: Depth=2 @@ -319,17 +314,16 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) { ; LA64-NEXT: bne $a2, $a4, .LBB6_5 ; LA64-NEXT: # %bb.4: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB6_3 Depth=2 -; LA64-NEXT: move $a7, $a6 -; LA64-NEXT: sc.w $a7, $a0, 0 -; LA64-NEXT: beqz $a7, .LBB6_3 +; LA64-NEXT: move $a6, $a5 +; LA64-NEXT: sc.w $a6, $a0, 0 +; LA64-NEXT: beqz $a6, .LBB6_3 ; LA64-NEXT: b .LBB6_6 ; LA64-NEXT: .LBB6_5: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB6_1 Depth=1 ; LA64-NEXT: dbar 20 ; LA64-NEXT: .LBB6_6: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB6_1 Depth=1 -; LA64-NEXT: move $a4, $a2 -; LA64-NEXT: bne $a2, $a5, .LBB6_1 +; LA64-NEXT: bne $a2, $a4, .LBB6_1 ; LA64-NEXT: # %bb.2: # %atomicrmw.end ; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret diff --git a/llvm/test/CodeGen/LoongArch/bstrins_w.ll b/llvm/test/CodeGen/LoongArch/bstrins_w.ll index dfbe000841cdcb..e008caacad2a17 100644 --- a/llvm/test/CodeGen/LoongArch/bstrins_w.ll +++ b/llvm/test/CodeGen/LoongArch/bstrins_w.ll @@ -145,6 +145,19 @@ define i32 
@pat5(i32 %a) nounwind { ret i32 %or } +;; The high bits of `const` are zero. +define i32 @pat5_high_zeros(i32 %a) nounwind { +; CHECK-LABEL: pat5_high_zeros: +; CHECK: # %bb.0: +; CHECK-NEXT: lu12i.w $a1, 1 +; CHECK-NEXT: ori $a1, $a1, 564 +; CHECK-NEXT: bstrins.w $a0, $a1, 31, 16 +; CHECK-NEXT: ret + %and = and i32 %a, 65535 ; 0x0000ffff + %or = or i32 %and, 305397760 ; 0x12340000 + ret i32 %or +} + ;; Pattern 6: a = b | ((c & mask) << shamt) ;; In this testcase b is 0x10000002, but in fact we do not require b being a ;; constant. As long as all positions in b to be overwritten by the incoming diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll index 417c865f6383ff..31ecec6ea8051b 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll @@ -69,6 +69,7 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind define void @cmpxchg_i32_acquire_acquire(ptr %ptr, i32 %cmp, i32 %val) nounwind { ; LA64-LABEL: cmpxchg_i32_acquire_acquire: ; LA64: # %bb.0: +; LA64-NEXT: addi.w $a1, $a1, 0 ; LA64-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 ; LA64-NEXT: ll.w $a3, $a0, 0 ; LA64-NEXT: bne $a3, $a1, .LBB2_3 @@ -172,6 +173,7 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin define void @cmpxchg_i32_acquire_monotonic(ptr %ptr, i32 %cmp, i32 %val) nounwind { ; LA64-LABEL: cmpxchg_i32_acquire_monotonic: ; LA64: # %bb.0: +; LA64-NEXT: addi.w $a1, $a1, 0 ; LA64-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 ; LA64-NEXT: ll.w $a3, $a0, 0 ; LA64-NEXT: bne $a3, $a1, .LBB6_3 @@ -279,9 +281,10 @@ define i16 @cmpxchg_i16_acquire_acquire_reti16(ptr %ptr, i16 %cmp, i16 %val) nou define i32 @cmpxchg_i32_acquire_acquire_reti32(ptr %ptr, i32 %cmp, i32 %val) nounwind { ; LA64-LABEL: cmpxchg_i32_acquire_acquire_reti32: ; LA64: # %bb.0: +; LA64-NEXT: addi.w $a3, 
$a1, 0 ; LA64-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 -; LA64-NEXT: ll.w $a3, $a0, 0 -; LA64-NEXT: bne $a3, $a1, .LBB10_3 +; LA64-NEXT: ll.w $a1, $a0, 0 +; LA64-NEXT: bne $a1, $a3, .LBB10_3 ; LA64-NEXT: # %bb.2: # in Loop: Header=BB10_1 Depth=1 ; LA64-NEXT: move $a4, $a2 ; LA64-NEXT: sc.w $a4, $a0, 0 @@ -290,7 +293,7 @@ define i32 @cmpxchg_i32_acquire_acquire_reti32(ptr %ptr, i32 %cmp, i32 %val) nou ; LA64-NEXT: .LBB10_3: ; LA64-NEXT: dbar 20 ; LA64-NEXT: .LBB10_4: -; LA64-NEXT: move $a0, $a3 +; LA64-NEXT: move $a0, $a1 ; LA64-NEXT: ret %tmp = cmpxchg ptr %ptr, i32 %cmp, i32 %val acquire acquire %res = extractvalue { i32, i1 } %tmp, 0 @@ -396,6 +399,7 @@ define i1 @cmpxchg_i16_acquire_acquire_reti1(ptr %ptr, i16 %cmp, i16 %val) nounw define i1 @cmpxchg_i32_acquire_acquire_reti1(ptr %ptr, i32 %cmp, i32 %val) nounwind { ; LA64-LABEL: cmpxchg_i32_acquire_acquire_reti1: ; LA64: # %bb.0: +; LA64-NEXT: addi.w $a1, $a1, 0 ; LA64-NEXT: .LBB14_1: # =>This Inner Loop Header: Depth=1 ; LA64-NEXT: ll.w $a3, $a0, 0 ; LA64-NEXT: bne $a3, $a1, .LBB14_3 @@ -407,8 +411,7 @@ define i1 @cmpxchg_i32_acquire_acquire_reti1(ptr %ptr, i32 %cmp, i32 %val) nounw ; LA64-NEXT: .LBB14_3: ; LA64-NEXT: dbar 20 ; LA64-NEXT: .LBB14_4: -; LA64-NEXT: addi.w $a0, $a1, 0 -; LA64-NEXT: xor $a0, $a3, $a0 +; LA64-NEXT: xor $a0, $a3, $a1 ; LA64-NEXT: sltui $a0, $a0, 1 ; LA64-NEXT: ret %tmp = cmpxchg ptr %ptr, i32 %cmp, i32 %val acquire acquire @@ -506,6 +509,7 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw define void @cmpxchg_i32_monotonic_monotonic(ptr %ptr, i32 %cmp, i32 %val) nounwind { ; LA64-LABEL: cmpxchg_i32_monotonic_monotonic: ; LA64: # %bb.0: +; LA64-NEXT: addi.w $a1, $a1, 0 ; LA64-NEXT: .LBB18_1: # =>This Inner Loop Header: Depth=1 ; LA64-NEXT: ll.w $a3, $a0, 0 ; LA64-NEXT: bne $a3, $a1, .LBB18_3 @@ -613,9 +617,10 @@ define i16 @cmpxchg_i16_monotonic_monotonic_reti16(ptr %ptr, i16 %cmp, i16 %val) define i32 
@cmpxchg_i32_monotonic_monotonic_reti32(ptr %ptr, i32 %cmp, i32 %val) nounwind { ; LA64-LABEL: cmpxchg_i32_monotonic_monotonic_reti32: ; LA64: # %bb.0: +; LA64-NEXT: addi.w $a3, $a1, 0 ; LA64-NEXT: .LBB22_1: # =>This Inner Loop Header: Depth=1 -; LA64-NEXT: ll.w $a3, $a0, 0 -; LA64-NEXT: bne $a3, $a1, .LBB22_3 +; LA64-NEXT: ll.w $a1, $a0, 0 +; LA64-NEXT: bne $a1, $a3, .LBB22_3 ; LA64-NEXT: # %bb.2: # in Loop: Header=BB22_1 Depth=1 ; LA64-NEXT: move $a4, $a2 ; LA64-NEXT: sc.w $a4, $a0, 0 @@ -624,7 +629,7 @@ define i32 @cmpxchg_i32_monotonic_monotonic_reti32(ptr %ptr, i32 %cmp, i32 %val) ; LA64-NEXT: .LBB22_3: ; LA64-NEXT: dbar 1792 ; LA64-NEXT: .LBB22_4: -; LA64-NEXT: move $a0, $a3 +; LA64-NEXT: move $a0, $a1 ; LA64-NEXT: ret %tmp = cmpxchg ptr %ptr, i32 %cmp, i32 %val monotonic monotonic %res = extractvalue { i32, i1 } %tmp, 0 @@ -730,6 +735,7 @@ define i1 @cmpxchg_i16_monotonic_monotonic_reti1(ptr %ptr, i16 %cmp, i16 %val) n define i1 @cmpxchg_i32_monotonic_monotonic_reti1(ptr %ptr, i32 %cmp, i32 %val) nounwind { ; LA64-LABEL: cmpxchg_i32_monotonic_monotonic_reti1: ; LA64: # %bb.0: +; LA64-NEXT: addi.w $a1, $a1, 0 ; LA64-NEXT: .LBB26_1: # =>This Inner Loop Header: Depth=1 ; LA64-NEXT: ll.w $a3, $a0, 0 ; LA64-NEXT: bne $a3, $a1, .LBB26_3 @@ -741,8 +747,7 @@ define i1 @cmpxchg_i32_monotonic_monotonic_reti1(ptr %ptr, i32 %cmp, i32 %val) n ; LA64-NEXT: .LBB26_3: ; LA64-NEXT: dbar 1792 ; LA64-NEXT: .LBB26_4: -; LA64-NEXT: addi.w $a0, $a1, 0 -; LA64-NEXT: xor $a0, $a3, $a0 +; LA64-NEXT: xor $a0, $a3, $a1 ; LA64-NEXT: sltui $a0, $a0, 1 ; LA64-NEXT: ret %tmp = cmpxchg ptr %ptr, i32 %cmp, i32 %val monotonic monotonic diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll index 589360823b1488..4d8160d7080340 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll @@ -16,6 +16,7 @@ define float @float_fadd_acquire(ptr 
%p) nounwind { ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB0_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB0_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -33,8 +34,7 @@ define float @float_fadd_acquire(ptr %p) nounwind { ; LA64F-NEXT: .LBB0_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB0_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB0_1 +; LA64F-NEXT: bne $a3, $a2, .LBB0_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -51,6 +51,7 @@ define float @float_fadd_acquire(ptr %p) nounwind { ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB0_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB0_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -68,8 +69,7 @@ define float @float_fadd_acquire(ptr %p) nounwind { ; LA64D-NEXT: .LBB0_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB0_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB0_1 +; LA64D-NEXT: bne $a3, $a2, .LBB0_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fadd ptr %p, float 1.0 acquire, align 4 @@ -90,6 +90,7 @@ define float @float_fsub_acquire(ptr %p) nounwind { ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB1_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB1_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -107,8 +108,7 @@ define float @float_fsub_acquire(ptr %p) nounwind { ; LA64F-NEXT: .LBB1_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB1_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; 
LA64F-NEXT: bne $a3, $a1, .LBB1_1 +; LA64F-NEXT: bne $a3, $a2, .LBB1_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -125,6 +125,7 @@ define float @float_fsub_acquire(ptr %p) nounwind { ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB1_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB1_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -142,8 +143,7 @@ define float @float_fsub_acquire(ptr %p) nounwind { ; LA64D-NEXT: .LBB1_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB1_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB1_1 +; LA64D-NEXT: bne $a3, $a2, .LBB1_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fsub ptr %p, float 1.0 acquire, align 4 @@ -165,6 +165,7 @@ define float @float_fmin_acquire(ptr %p) nounwind { ; LA64F-NEXT: fmin.s $fa2, $fa2, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB2_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB2_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -182,8 +183,7 @@ define float @float_fmin_acquire(ptr %p) nounwind { ; LA64F-NEXT: .LBB2_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB2_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB2_1 +; LA64F-NEXT: bne $a3, $a2, .LBB2_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -201,6 +201,7 @@ define float @float_fmin_acquire(ptr %p) nounwind { ; LA64D-NEXT: fmin.s $fa2, $fa2, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB2_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB2_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -218,8 +219,7 @@ define float 
@float_fmin_acquire(ptr %p) nounwind { ; LA64D-NEXT: .LBB2_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB2_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB2_1 +; LA64D-NEXT: bne $a3, $a2, .LBB2_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fmin ptr %p, float 1.0 acquire, align 4 @@ -241,6 +241,7 @@ define float @float_fmax_acquire(ptr %p) nounwind { ; LA64F-NEXT: fmax.s $fa2, $fa2, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB3_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB3_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -258,8 +259,7 @@ define float @float_fmax_acquire(ptr %p) nounwind { ; LA64F-NEXT: .LBB3_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB3_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB3_1 +; LA64F-NEXT: bne $a3, $a2, .LBB3_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -277,6 +277,7 @@ define float @float_fmax_acquire(ptr %p) nounwind { ; LA64D-NEXT: fmax.s $fa2, $fa2, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB3_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB3_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -294,8 +295,7 @@ define float @float_fmax_acquire(ptr %p) nounwind { ; LA64D-NEXT: .LBB3_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB3_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB3_1 +; LA64D-NEXT: bne $a3, $a2, .LBB3_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fmax ptr %p, float 1.0 acquire, align 4 @@ -694,6 +694,7 @@ define float @float_fadd_release(ptr %p) nounwind { ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, 
$fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB8_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB8_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -711,8 +712,7 @@ define float @float_fadd_release(ptr %p) nounwind { ; LA64F-NEXT: .LBB8_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB8_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB8_1 +; LA64F-NEXT: bne $a3, $a2, .LBB8_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -729,6 +729,7 @@ define float @float_fadd_release(ptr %p) nounwind { ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB8_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB8_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -746,8 +747,7 @@ define float @float_fadd_release(ptr %p) nounwind { ; LA64D-NEXT: .LBB8_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB8_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB8_1 +; LA64D-NEXT: bne $a3, $a2, .LBB8_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fadd ptr %p, float 1.0 release, align 4 @@ -768,6 +768,7 @@ define float @float_fsub_release(ptr %p) nounwind { ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB9_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB9_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -785,8 +786,7 @@ define float @float_fsub_release(ptr %p) nounwind { ; LA64F-NEXT: .LBB9_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB9_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB9_1 +; LA64F-NEXT: bne $a3, $a2, .LBB9_1 ; 
LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -803,6 +803,7 @@ define float @float_fsub_release(ptr %p) nounwind { ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB9_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB9_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -820,8 +821,7 @@ define float @float_fsub_release(ptr %p) nounwind { ; LA64D-NEXT: .LBB9_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB9_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB9_1 +; LA64D-NEXT: bne $a3, $a2, .LBB9_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fsub ptr %p, float 1.0 release, align 4 @@ -843,6 +843,7 @@ define float @float_fmin_release(ptr %p) nounwind { ; LA64F-NEXT: fmin.s $fa2, $fa2, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB10_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB10_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -860,8 +861,7 @@ define float @float_fmin_release(ptr %p) nounwind { ; LA64F-NEXT: .LBB10_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB10_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB10_1 +; LA64F-NEXT: bne $a3, $a2, .LBB10_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -879,6 +879,7 @@ define float @float_fmin_release(ptr %p) nounwind { ; LA64D-NEXT: fmin.s $fa2, $fa2, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB10_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB10_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -896,8 +897,7 @@ define float @float_fmin_release(ptr %p) nounwind { ; LA64D-NEXT: .LBB10_6: # 
%atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB10_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB10_1 +; LA64D-NEXT: bne $a3, $a2, .LBB10_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fmin ptr %p, float 1.0 release, align 4 @@ -919,6 +919,7 @@ define float @float_fmax_release(ptr %p) nounwind { ; LA64F-NEXT: fmax.s $fa2, $fa2, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB11_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB11_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -936,8 +937,7 @@ define float @float_fmax_release(ptr %p) nounwind { ; LA64F-NEXT: .LBB11_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB11_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB11_1 +; LA64F-NEXT: bne $a3, $a2, .LBB11_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -955,6 +955,7 @@ define float @float_fmax_release(ptr %p) nounwind { ; LA64D-NEXT: fmax.s $fa2, $fa2, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB11_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB11_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -972,8 +973,7 @@ define float @float_fmax_release(ptr %p) nounwind { ; LA64D-NEXT: .LBB11_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB11_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB11_1 +; LA64D-NEXT: bne $a3, $a2, .LBB11_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fmax ptr %p, float 1.0 release, align 4 @@ -1372,6 +1372,7 @@ define float @float_fadd_acq_rel(ptr %p) nounwind { ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; 
LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB16_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB16_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -1389,8 +1390,7 @@ define float @float_fadd_acq_rel(ptr %p) nounwind { ; LA64F-NEXT: .LBB16_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB16_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB16_1 +; LA64F-NEXT: bne $a3, $a2, .LBB16_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -1407,6 +1407,7 @@ define float @float_fadd_acq_rel(ptr %p) nounwind { ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB16_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB16_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -1424,8 +1425,7 @@ define float @float_fadd_acq_rel(ptr %p) nounwind { ; LA64D-NEXT: .LBB16_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB16_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB16_1 +; LA64D-NEXT: bne $a3, $a2, .LBB16_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fadd ptr %p, float 1.0 acq_rel, align 4 @@ -1446,6 +1446,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind { ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB17_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB17_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -1463,8 +1464,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind { ; LA64F-NEXT: .LBB17_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB17_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB17_1 +; LA64F-NEXT: bne $a3, $a2, .LBB17_1 ; LA64F-NEXT: # %bb.2: # 
%atomicrmw.end ; LA64F-NEXT: ret ; @@ -1481,6 +1481,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind { ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB17_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB17_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -1498,8 +1499,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind { ; LA64D-NEXT: .LBB17_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB17_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB17_1 +; LA64D-NEXT: bne $a3, $a2, .LBB17_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fsub ptr %p, float 1.0 acq_rel, align 4 @@ -1521,6 +1521,7 @@ define float @float_fmin_acq_rel(ptr %p) nounwind { ; LA64F-NEXT: fmin.s $fa2, $fa2, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB18_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB18_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -1538,8 +1539,7 @@ define float @float_fmin_acq_rel(ptr %p) nounwind { ; LA64F-NEXT: .LBB18_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB18_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB18_1 +; LA64F-NEXT: bne $a3, $a2, .LBB18_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -1557,6 +1557,7 @@ define float @float_fmin_acq_rel(ptr %p) nounwind { ; LA64D-NEXT: fmin.s $fa2, $fa2, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB18_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB18_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -1574,8 +1575,7 @@ define float @float_fmin_acq_rel(ptr %p) nounwind { ; LA64D-NEXT: .LBB18_6: # 
%atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB18_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB18_1 +; LA64D-NEXT: bne $a3, $a2, .LBB18_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fmin ptr %p, float 1.0 acq_rel, align 4 @@ -1597,6 +1597,7 @@ define float @float_fmax_acq_rel(ptr %p) nounwind { ; LA64F-NEXT: fmax.s $fa2, $fa2, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB19_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB19_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -1614,8 +1615,7 @@ define float @float_fmax_acq_rel(ptr %p) nounwind { ; LA64F-NEXT: .LBB19_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB19_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB19_1 +; LA64F-NEXT: bne $a3, $a2, .LBB19_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -1633,6 +1633,7 @@ define float @float_fmax_acq_rel(ptr %p) nounwind { ; LA64D-NEXT: fmax.s $fa2, $fa2, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB19_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB19_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -1650,8 +1651,7 @@ define float @float_fmax_acq_rel(ptr %p) nounwind { ; LA64D-NEXT: .LBB19_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB19_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB19_1 +; LA64D-NEXT: bne $a3, $a2, .LBB19_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fmax ptr %p, float 1.0 acq_rel, align 4 @@ -2074,6 +2074,7 @@ define float @float_fadd_seq_cst(ptr %p) nounwind { ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, 
$fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB24_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB24_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -2091,8 +2092,7 @@ define float @float_fadd_seq_cst(ptr %p) nounwind { ; LA64F-NEXT: .LBB24_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB24_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB24_1 +; LA64F-NEXT: bne $a3, $a2, .LBB24_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -2109,6 +2109,7 @@ define float @float_fadd_seq_cst(ptr %p) nounwind { ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB24_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB24_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -2126,8 +2127,7 @@ define float @float_fadd_seq_cst(ptr %p) nounwind { ; LA64D-NEXT: .LBB24_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB24_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB24_1 +; LA64D-NEXT: bne $a3, $a2, .LBB24_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fadd ptr %p, float 1.0 seq_cst, align 4 @@ -2148,6 +2148,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind { ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB25_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB25_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -2165,8 +2166,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind { ; LA64F-NEXT: .LBB25_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB25_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB25_1 +; LA64F-NEXT: bne $a3, $a2, .LBB25_1 ; LA64F-NEXT: # 
%bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -2183,6 +2183,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind { ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB25_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB25_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -2200,8 +2201,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind { ; LA64D-NEXT: .LBB25_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB25_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB25_1 +; LA64D-NEXT: bne $a3, $a2, .LBB25_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fsub ptr %p, float 1.0 seq_cst, align 4 @@ -2223,6 +2223,7 @@ define float @float_fmin_seq_cst(ptr %p) nounwind { ; LA64F-NEXT: fmin.s $fa2, $fa2, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB26_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB26_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -2240,8 +2241,7 @@ define float @float_fmin_seq_cst(ptr %p) nounwind { ; LA64F-NEXT: .LBB26_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB26_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB26_1 +; LA64F-NEXT: bne $a3, $a2, .LBB26_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -2259,6 +2259,7 @@ define float @float_fmin_seq_cst(ptr %p) nounwind { ; LA64D-NEXT: fmin.s $fa2, $fa2, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB26_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB26_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -2276,8 +2277,7 @@ define float @float_fmin_seq_cst(ptr %p) nounwind { ; LA64D-NEXT: .LBB26_6: # 
%atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB26_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB26_1 +; LA64D-NEXT: bne $a3, $a2, .LBB26_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fmin ptr %p, float 1.0 seq_cst, align 4 @@ -2299,6 +2299,7 @@ define float @float_fmax_seq_cst(ptr %p) nounwind { ; LA64F-NEXT: fmax.s $fa2, $fa2, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB27_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB27_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -2316,8 +2317,7 @@ define float @float_fmax_seq_cst(ptr %p) nounwind { ; LA64F-NEXT: .LBB27_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB27_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB27_1 +; LA64F-NEXT: bne $a3, $a2, .LBB27_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -2335,6 +2335,7 @@ define float @float_fmax_seq_cst(ptr %p) nounwind { ; LA64D-NEXT: fmax.s $fa2, $fa2, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB27_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB27_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -2352,8 +2353,7 @@ define float @float_fmax_seq_cst(ptr %p) nounwind { ; LA64D-NEXT: .LBB27_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB27_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB27_1 +; LA64D-NEXT: bne $a3, $a2, .LBB27_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fmax ptr %p, float 1.0 seq_cst, align 4 @@ -2752,6 +2752,7 @@ define float @float_fadd_monotonic(ptr %p) nounwind { ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, 
$fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB32_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB32_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -2769,8 +2770,7 @@ define float @float_fadd_monotonic(ptr %p) nounwind { ; LA64F-NEXT: .LBB32_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB32_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB32_1 +; LA64F-NEXT: bne $a3, $a2, .LBB32_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -2787,6 +2787,7 @@ define float @float_fadd_monotonic(ptr %p) nounwind { ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB32_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB32_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -2804,8 +2805,7 @@ define float @float_fadd_monotonic(ptr %p) nounwind { ; LA64D-NEXT: .LBB32_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB32_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB32_1 +; LA64D-NEXT: bne $a3, $a2, .LBB32_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fadd ptr %p, float 1.0 monotonic, align 4 @@ -2826,6 +2826,7 @@ define float @float_fsub_monotonic(ptr %p) nounwind { ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB33_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB33_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -2843,8 +2844,7 @@ define float @float_fsub_monotonic(ptr %p) nounwind { ; LA64F-NEXT: .LBB33_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB33_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB33_1 +; LA64F-NEXT: bne $a3, $a2, .LBB33_1 ; 
LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -2861,6 +2861,7 @@ define float @float_fsub_monotonic(ptr %p) nounwind { ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB33_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB33_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -2878,8 +2879,7 @@ define float @float_fsub_monotonic(ptr %p) nounwind { ; LA64D-NEXT: .LBB33_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB33_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB33_1 +; LA64D-NEXT: bne $a3, $a2, .LBB33_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fsub ptr %p, float 1.0 monotonic, align 4 @@ -2901,6 +2901,7 @@ define float @float_fmin_monotonic(ptr %p) nounwind { ; LA64F-NEXT: fmin.s $fa2, $fa2, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB34_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB34_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -2918,8 +2919,7 @@ define float @float_fmin_monotonic(ptr %p) nounwind { ; LA64F-NEXT: .LBB34_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB34_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB34_1 +; LA64F-NEXT: bne $a3, $a2, .LBB34_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -2937,6 +2937,7 @@ define float @float_fmin_monotonic(ptr %p) nounwind { ; LA64D-NEXT: fmin.s $fa2, $fa2, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB34_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB34_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -2954,8 +2955,7 @@ define float @float_fmin_monotonic(ptr %p) nounwind { ; 
LA64D-NEXT: .LBB34_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB34_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB34_1 +; LA64D-NEXT: bne $a3, $a2, .LBB34_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fmin ptr %p, float 1.0 monotonic, align 4 @@ -2977,6 +2977,7 @@ define float @float_fmax_monotonic(ptr %p) nounwind { ; LA64F-NEXT: fmax.s $fa2, $fa2, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB35_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB35_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -2994,8 +2995,7 @@ define float @float_fmax_monotonic(ptr %p) nounwind { ; LA64F-NEXT: .LBB35_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB35_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB35_1 +; LA64F-NEXT: bne $a3, $a2, .LBB35_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -3013,6 +3013,7 @@ define float @float_fmax_monotonic(ptr %p) nounwind { ; LA64D-NEXT: fmax.s $fa2, $fa2, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB35_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB35_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -3030,8 +3031,7 @@ define float @float_fmax_monotonic(ptr %p) nounwind { ; LA64D-NEXT: .LBB35_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB35_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB35_1 +; LA64D-NEXT: bne $a3, $a2, .LBB35_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fmax ptr %p, float 1.0 monotonic, align 4 diff --git a/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir b/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir 
new file mode 100644 index 00000000000000..9bbb579b762e63 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir @@ -0,0 +1,213 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc -o - %s -mtriple=loongarch64 \ +# RUN: -run-pass=register-coalescer -join-liveintervals=1 -join-splitedges=0 | FileCheck %s + +--- +name: foo +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: foo + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $r4, $r5, $r6, $r7, $r8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $r8 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $r7 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $r6 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr = COPY $r5 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr = COPY $r4 + ; CHECK-NEXT: [[ANDI:%[0-9]+]]:gpr = ANDI [[COPY3]], 1 + ; CHECK-NEXT: [[ORI:%[0-9]+]]:gpr = ORI $r0, 1 + ; CHECK-NEXT: [[ANDI1:%[0-9]+]]:gpr = ANDI [[COPY2]], 1 + ; CHECK-NEXT: [[ANDI2:%[0-9]+]]:gpr = ANDI [[COPY1]], 1 + ; CHECK-NEXT: [[ANDI3:%[0-9]+]]:gpr = ANDI [[COPY]], 1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr = COPY $r0 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = COPY $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr = COPY [[COPY5]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BEQZ [[ANDI]], %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.9(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: PseudoBR %bb.9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: successors: %bb.7(0x7c000000), %bb.6(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead 
[[LD_D:%[0-9]+]]:gpr = LD_D $r0, 8 + ; CHECK-NEXT: dead [[LD_D1:%[0-9]+]]:gpr = LD_D $r0, 0 + ; CHECK-NEXT: BNEZ [[ANDI1]], %bb.7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6: + ; CHECK-NEXT: successors: %bb.11(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = COPY $r0 + ; CHECK-NEXT: PseudoBR %bb.11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7: + ; CHECK-NEXT: successors: %bb.8(0x7c000000), %bb.10(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BEQZ [[ANDI2]], %bb.10 + ; CHECK-NEXT: PseudoBR %bb.8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8: + ; CHECK-NEXT: successors: %bb.9(0x04000000), %bb.5(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = ADDI_D [[COPY6]], 1 + ; CHECK-NEXT: BEQZ [[ANDI3]], %bb.5 + ; CHECK-NEXT: PseudoBR %bb.9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.9: + ; CHECK-NEXT: successors: %bb.12(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ST_B $r0, [[COPY4]], 0 + ; CHECK-NEXT: PseudoBR %bb.12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.10: + ; CHECK-NEXT: successors: %bb.11(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr = ADDI_D [[COPY6]], 1 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = COPY [[ORI]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.11: + ; CHECK-NEXT: successors: %bb.12(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ST_D $r0, [[COPY4]], 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.12: + ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.1(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BEQ [[COPY7]], [[ORI]], %bb.2 + ; CHECK-NEXT: PseudoBR %bb.1 + bb.0: + liveins: $r4, $r5, $r6, $r7, $r8 + + %0:gpr = COPY killed $r8 + %1:gpr = COPY killed $r7 + %2:gpr = COPY killed $r6 + %3:gpr = COPY killed $r5 + %4:gpr = COPY killed $r4 + %5:gpr = COPY $r0 + %6:gpr = COPY killed %5 + %7:gpr = ANDI killed %3, 1 + %8:gpr = ORI $r0, 1 + %9:gpr = ANDI killed %2, 1 + %10:gpr = ANDI killed %1, 1 + %11:gpr = ANDI killed %0, 1 + %12:gpr = COPY %6 + %13:gpr = COPY killed %6 + %14:gpr = 
IMPLICIT_DEF + + bb.1: + %15:gpr = COPY killed %14 + %16:gpr = COPY killed %13 + %17:gpr = COPY killed %12 + %18:gpr = COPY %17 + %19:gpr = COPY %16 + %20:gpr = COPY killed %16 + %21:gpr = COPY killed %15 + + bb.2: + successors: %bb.3, %bb.4 + + %22:gpr = COPY killed %21 + %23:gpr = COPY killed %20 + %24:gpr = COPY killed %19 + %25:gpr = COPY killed %18 + BEQZ %7, %bb.4 + + bb.3: + %26:gpr = COPY killed %24 + %27:gpr = COPY killed %23 + PseudoBR %bb.9 + + bb.4: + %28:gpr = COPY killed %23 + + bb.5: + successors: %bb.7(0x7c000000), %bb.6(0x04000000) + + %29:gpr = COPY killed %28 + dead %30:gpr = LD_D $r0, 8 + dead %31:gpr = LD_D $r0, 0 + BNEZ %9, %bb.7 + + bb.6: + %32:gpr = COPY $r0 + %33:gpr = COPY killed %32 + %34:gpr = COPY killed %33 + %35:gpr = COPY killed %22 + PseudoBR %bb.11 + + bb.7: + successors: %bb.8(0x7c000000), %bb.10(0x04000000) + + BEQZ %10, %bb.10 + PseudoBR %bb.8 + + bb.8: + successors: %bb.9(0x04000000), %bb.5(0x7c000000) + + %36:gpr = ADDI_D killed %29, 1 + %28:gpr = COPY %36 + %26:gpr = COPY %36 + %27:gpr = COPY killed %36 + BEQZ %11, %bb.5 + PseudoBR %bb.9 + + bb.9: + %37:gpr = COPY killed %27 + %38:gpr = COPY killed %26 + %39:gpr = COPY $r0 + ST_B killed %39, %4, 0 + %40:gpr = COPY killed %25 + %41:gpr = COPY killed %38 + %42:gpr = COPY killed %37 + %43:gpr = COPY killed %22 + PseudoBR %bb.12 + + bb.10: + %44:gpr = ADDI_D killed %29, 1 + %34:gpr = COPY %8 + %35:gpr = COPY killed %44 + + bb.11: + %45:gpr = COPY killed %35 + %46:gpr = COPY killed %34 + %47:gpr = COPY $r0 + ST_D killed %47, %4, 0 + %40:gpr = COPY %45 + %41:gpr = COPY %46 + %42:gpr = COPY killed %46 + %43:gpr = COPY killed %45 + + bb.12: + successors: %bb.2(0x7c000000), %bb.1(0x04000000) + + %48:gpr = COPY killed %43 + %49:gpr = COPY killed %42 + %50:gpr = COPY killed %41 + %51:gpr = COPY killed %40 + %12:gpr = COPY %51 + %13:gpr = COPY %50 + %14:gpr = COPY %48 + %18:gpr = COPY killed %51 + %19:gpr = COPY killed %50 + %20:gpr = COPY killed %49 + %21:gpr = COPY killed %48 + BEQ 
%17, %8, %bb.2 + PseudoBR %bb.1 + +... diff --git a/llvm/test/CodeGen/Mips/Fast-ISel/pr40325.ll b/llvm/test/CodeGen/Mips/Fast-ISel/pr40325.ll index 9e64d7b2fa039b..c276515920d52a 100644 --- a/llvm/test/CodeGen/Mips/Fast-ISel/pr40325.ll +++ b/llvm/test/CodeGen/Mips/Fast-ISel/pr40325.ll @@ -11,7 +11,7 @@ define void @test(i32 %x, ptr %p) nounwind { ; CHECK-NEXT: andi $1, $4, 1 ; CHECK-NEXT: bgtz $1, $BB0_1 ; CHECK-NEXT: nop -; CHECK-NEXT: # %bb.1: # %foo +; CHECK-NEXT: $BB0_1: # %foo ; CHECK-NEXT: jr $ra ; CHECK-NEXT: nop %y = and i32 %x, 1 diff --git a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/jump_table_and_brjt.ll b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/jump_table_and_brjt.ll index 4c10fedaa4a884..74765ff0e8f106 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/jump_table_and_brjt.ll +++ b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/jump_table_and_brjt.ll @@ -25,7 +25,7 @@ define i32 @mod4_0_to_11(i32 %a) { ; MIPS32-NEXT: sltu $1, $1, $2 ; MIPS32-NEXT: bnez $1, $BB0_6 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB0_1: # %entry +; MIPS32-NEXT: # %bb.1: # %entry ; MIPS32-NEXT: lw $2, 28($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lui $1, %hi($JTI0_0) ; MIPS32-NEXT: sll $2, $2, 2 @@ -65,7 +65,7 @@ define i32 @mod4_0_to_11(i32 %a) { ; MIPS32-NEXT: sltu $1, $1, $2 ; MIPS32-NEXT: bnez $1, $BB0_13 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB0_8: # %sw.epilog +; MIPS32-NEXT: # %bb.8: # %sw.epilog ; MIPS32-NEXT: lw $2, 0($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lui $1, %hi($JTI0_1) ; MIPS32-NEXT: sll $2, $2, 2 @@ -125,7 +125,7 @@ define i32 @mod4_0_to_11(i32 %a) { ; MIPS32_PIC-NEXT: sltu $1, $1, $2 ; MIPS32_PIC-NEXT: bnez $1, $BB0_6 ; MIPS32_PIC-NEXT: nop -; MIPS32_PIC-NEXT: $BB0_1: # %entry +; MIPS32_PIC-NEXT: # %bb.1: # %entry ; MIPS32_PIC-NEXT: lw $2, 8($sp) # 4-byte Folded Reload ; MIPS32_PIC-NEXT: lw $3, 36($sp) # 4-byte Folded Reload ; MIPS32_PIC-NEXT: lw $1, %got($JTI0_0)($2) @@ -167,7 +167,7 @@ define i32 @mod4_0_to_11(i32 %a) { ; MIPS32_PIC-NEXT: sltu $1, $1, $2 ; 
MIPS32_PIC-NEXT: bnez $1, $BB0_13 ; MIPS32_PIC-NEXT: nop -; MIPS32_PIC-NEXT: $BB0_8: # %sw.epilog +; MIPS32_PIC-NEXT: # %bb.8: # %sw.epilog ; MIPS32_PIC-NEXT: lw $2, 8($sp) # 4-byte Folded Reload ; MIPS32_PIC-NEXT: lw $3, 4($sp) # 4-byte Folded Reload ; MIPS32_PIC-NEXT: lw $1, %got($JTI0_1)($2) diff --git a/llvm/test/CodeGen/Mips/compactbranches/unsafe-in-forbidden-slot.ll b/llvm/test/CodeGen/Mips/compactbranches/unsafe-in-forbidden-slot.ll index cbd8b2370ac969..de61515cff9bcb 100644 --- a/llvm/test/CodeGen/Mips/compactbranches/unsafe-in-forbidden-slot.ll +++ b/llvm/test/CodeGen/Mips/compactbranches/unsafe-in-forbidden-slot.ll @@ -18,7 +18,7 @@ sw.bb: ; preds = %entry br label %sw.epilog ; CHECK: beqzc ; CHECK-NEXT: nop -; CHECK-NEXT: .LBB +; CHECK-NEXT: # %bb.1 ; CHECK-NEXT: j sw.bb1: ; preds = %entry, %entry @@ -26,7 +26,7 @@ sw.bb1: ; preds = %entry, %entry br label %sw.epilog ; CHECK: bnezc ; CHECK-NEXT: nop -; CHECK-NEXT: .LBB +; CHECK-NEXT: # %bb.3 ; CHECK-NEXT: j sw.epilog: ; preds = %entry, %sw.bb1, %sw.bb diff --git a/llvm/test/CodeGen/Mips/hf1_body.ll b/llvm/test/CodeGen/Mips/hf1_body.ll index 184ea31bddc9d2..c3dea67896210a 100644 --- a/llvm/test/CodeGen/Mips/hf1_body.ll +++ b/llvm/test/CodeGen/Mips/hf1_body.ll @@ -23,8 +23,8 @@ entry: ; ALL: .set reorder ; ALL: .reloc 0, R_MIPS_NONE, v_sf ; GAS: la $25, $__fn_local_v_sf -; IAS: lw $25, %got($$__fn_local_v_sf)($gp) -; IAS: addiu $25, $25, %lo($$__fn_local_v_sf) +; IAS: lw $25, %got($__fn_local_v_sf)($gp) +; IAS: addiu $25, $25, %lo($__fn_local_v_sf) ; ALL: mfc1 $4, $f12 ; ALL: jr $25 ; ALL: .end __fn_stub_v_sf diff --git a/llvm/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll b/llvm/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll index 1ce46cfa07cf87..634216027ef6bb 100644 --- a/llvm/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll +++ b/llvm/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll @@ -42,7 +42,7 @@ define ptr @_Z3fooi(i32 signext %Letter) { ; MIPS32R2-NEXT: sltiu $1, $4, 7 ; 
MIPS32R2-NEXT: beqz $1, $BB0_6 ; MIPS32R2-NEXT: sw $4, 4($sp) -; MIPS32R2-NEXT: $BB0_1: # %entry +; MIPS32R2-NEXT: # %bb.1: # %entry ; MIPS32R2-NEXT: sll $1, $4, 2 ; MIPS32R2-NEXT: lui $2, %hi($JTI0_0) ; MIPS32R2-NEXT: addu $1, $1, $2 @@ -100,7 +100,7 @@ define ptr @_Z3fooi(i32 signext %Letter) { ; MIPS32R6-NEXT: sltiu $1, $4, 7 ; MIPS32R6-NEXT: beqz $1, $BB0_6 ; MIPS32R6-NEXT: sw $4, 4($sp) -; MIPS32R6-NEXT: $BB0_1: # %entry +; MIPS32R6-NEXT: # %bb.1: # %entry ; MIPS32R6-NEXT: sll $1, $4, 2 ; MIPS32R6-NEXT: lui $2, %hi($JTI0_0) ; MIPS32R6-NEXT: addu $1, $1, $2 @@ -159,7 +159,7 @@ define ptr @_Z3fooi(i32 signext %Letter) { ; MIPS64R2-NEXT: sltiu $1, $2, 7 ; MIPS64R2-NEXT: beqz $1, .LBB0_6 ; MIPS64R2-NEXT: sw $4, 4($sp) -; MIPS64R2-NEXT: .LBB0_1: # %entry +; MIPS64R2-NEXT: # %bb.1: # %entry ; MIPS64R2-NEXT: dsll $1, $2, 3 ; MIPS64R2-NEXT: lui $2, %highest(.LJTI0_0) ; MIPS64R2-NEXT: daddiu $2, $2, %higher(.LJTI0_0) @@ -254,7 +254,7 @@ define ptr @_Z3fooi(i32 signext %Letter) { ; MIPS64R6-NEXT: sltiu $1, $2, 7 ; MIPS64R6-NEXT: beqz $1, .LBB0_6 ; MIPS64R6-NEXT: sw $4, 4($sp) -; MIPS64R6-NEXT: .LBB0_1: # %entry +; MIPS64R6-NEXT: # %bb.1: # %entry ; MIPS64R6-NEXT: dsll $1, $2, 3 ; MIPS64R6-NEXT: lui $2, %highest(.LJTI0_0) ; MIPS64R6-NEXT: daddiu $2, $2, %higher(.LJTI0_0) @@ -351,7 +351,7 @@ define ptr @_Z3fooi(i32 signext %Letter) { ; PIC-MIPS32R2-NEXT: sltiu $1, $4, 7 ; PIC-MIPS32R2-NEXT: beqz $1, $BB0_6 ; PIC-MIPS32R2-NEXT: sw $4, 4($sp) -; PIC-MIPS32R2-NEXT: $BB0_1: # %entry +; PIC-MIPS32R2-NEXT: # %bb.1: # %entry ; PIC-MIPS32R2-NEXT: sll $1, $4, 2 ; PIC-MIPS32R2-NEXT: lw $3, %got($JTI0_0)($2) ; PIC-MIPS32R2-NEXT: addu $1, $1, $3 @@ -413,7 +413,7 @@ define ptr @_Z3fooi(i32 signext %Letter) { ; PIC-MIPS32R6-NEXT: sltiu $1, $4, 7 ; PIC-MIPS32R6-NEXT: beqz $1, $BB0_6 ; PIC-MIPS32R6-NEXT: sw $4, 4($sp) -; PIC-MIPS32R6-NEXT: $BB0_1: # %entry +; PIC-MIPS32R6-NEXT: # %bb.1: # %entry ; PIC-MIPS32R6-NEXT: sll $1, $4, 2 ; PIC-MIPS32R6-NEXT: lw $3, %got($JTI0_0)($2) ; 
PIC-MIPS32R6-NEXT: addu $1, $1, $3 @@ -476,7 +476,7 @@ define ptr @_Z3fooi(i32 signext %Letter) { ; PIC-MIPS64R2-NEXT: sltiu $1, $3, 7 ; PIC-MIPS64R2-NEXT: beqz $1, .LBB0_6 ; PIC-MIPS64R2-NEXT: sw $4, 4($sp) -; PIC-MIPS64R2-NEXT: .LBB0_1: # %entry +; PIC-MIPS64R2-NEXT: # %bb.1: # %entry ; PIC-MIPS64R2-NEXT: dsll $1, $3, 3 ; PIC-MIPS64R2-NEXT: ld $3, %got_page(.LJTI0_0)($2) ; PIC-MIPS64R2-NEXT: daddu $1, $1, $3 @@ -539,7 +539,7 @@ define ptr @_Z3fooi(i32 signext %Letter) { ; PIC-MIPS64R6-NEXT: sltiu $1, $3, 7 ; PIC-MIPS64R6-NEXT: beqz $1, .LBB0_6 ; PIC-MIPS64R6-NEXT: sw $4, 4($sp) -; PIC-MIPS64R6-NEXT: .LBB0_1: # %entry +; PIC-MIPS64R6-NEXT: # %bb.1: # %entry ; PIC-MIPS64R6-NEXT: dsll $1, $3, 3 ; PIC-MIPS64R6-NEXT: ld $3, %got_page(.LJTI0_0)($2) ; PIC-MIPS64R6-NEXT: daddu $1, $1, $3 diff --git a/llvm/test/CodeGen/Mips/inlineasm-constraints-softfloat.ll b/llvm/test/CodeGen/Mips/inlineasm-constraints-softfloat.ll new file mode 100644 index 00000000000000..705570f808ce00 --- /dev/null +++ b/llvm/test/CodeGen/Mips/inlineasm-constraints-softfloat.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -march=mips < %s | FileCheck %s --check-prefix=MIPS32 +; RUN: llc -march=mips64 < %s | FileCheck %s --check-prefix=MIPS64 + +define dso_local void @read_double(ptr nocapture noundef readonly %0) local_unnamed_addr #0 { +; MIPS32-LABEL: read_double: +; MIPS32: # %bb.0: +; MIPS32-NEXT: lw $2, 4($4) +; MIPS32-NEXT: lw $3, 0($4) +; MIPS32-NEXT: #APP +; MIPS32-NEXT: #NO_APP +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: nop +; +; MIPS64-LABEL: read_double: +; MIPS64: # %bb.0: +; MIPS64-NEXT: ld $2, 0($4) +; MIPS64-NEXT: #APP +; MIPS64-NEXT: #NO_APP +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: nop + %2 = load double, ptr %0, align 8 + tail call void asm sideeffect "", "r,~{$1}"(double %2) + ret void +} + +define dso_local void @read_float(ptr nocapture noundef readonly %0) local_unnamed_addr #0 { +; MIPS32-LABEL: 
read_float: +; MIPS32: # %bb.0: +; MIPS32-NEXT: lw $2, 0($4) +; MIPS32-NEXT: #APP +; MIPS32-NEXT: #NO_APP +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: nop +; +; MIPS64-LABEL: read_float: +; MIPS64: # %bb.0: +; MIPS64-NEXT: lw $2, 0($4) +; MIPS64-NEXT: #APP +; MIPS64-NEXT: #NO_APP +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: nop + %2 = load float, ptr %0, align 8 + tail call void asm sideeffect "", "r,~{$1}"(float %2) + ret void +} + +attributes #0 = { "target-features"="+soft-float" "use-soft-float"="true" } diff --git a/llvm/test/CodeGen/Mips/jump-table-mul.ll b/llvm/test/CodeGen/Mips/jump-table-mul.ll index 22f41f53d154bf..cca6080a07544c 100644 --- a/llvm/test/CodeGen/Mips/jump-table-mul.ll +++ b/llvm/test/CodeGen/Mips/jump-table-mul.ll @@ -10,7 +10,7 @@ define i64 @test(i64 %arg) { ; CHECK-NEXT: sltiu $1, $4, 11 ; CHECK-NEXT: beqz $1, .LBB0_4 ; CHECK-NEXT: nop -; CHECK-NEXT: .LBB0_1: # %entry +; CHECK-NEXT: # %bb.1: # %entry ; CHECK-NEXT: daddiu $1, $2, %lo(%neg(%gp_rel(test))) ; CHECK-NEXT: dsll $2, $4, 3 ; CHECK-NEXT: ld $3, %got_page(.LJTI0_0)($1) diff --git a/llvm/test/CodeGen/Mips/llvm-ir/forbidden-slot-ir.ll b/llvm/test/CodeGen/Mips/llvm-ir/forbidden-slot-ir.ll new file mode 100644 index 00000000000000..3e6826f0cd1d13 --- /dev/null +++ b/llvm/test/CodeGen/Mips/llvm-ir/forbidden-slot-ir.ll @@ -0,0 +1,71 @@ +target triple = "mipsisa32r6el-unknown-linux-gnu" + +; RUN: llc -filetype=asm %s -o - | FileCheck %s --check-prefix=MIPSELR6 +; Function Attrs: noinline nounwind optnone uwtable +define i1 @foo0() nounwind { +; MIPSELR6: bnezc $1, $BB0_2 +; MIPSELR6-NEXT: nop +; MIPSELR6: jr $ra +entry: + %0 = icmp eq i32 0, 1 + br i1 %0, label %2, label %3 + ret i1 %0 +2: + ret i1 %0 +3: + ret i1 %0 +} + +define i32 @foo1() nounwind { +; MIPSELR6: addiu $2, $2, 1 +; MIPSELR6-NEXT: .set noreorder +; MIPSELR6-NEXT: beqzc $2, $tmp0 +; MIPSELR6-NEXT: nop +; MIPSELR6-NEXT: .set reorder +; MIPSELR6: jrc $ra +entry: + %0 = tail call i32 asm "1: addiu $0, $0, 1; beqzc $0, 1b", "=r"() 
nounwind + ret i32 %0 +} + +define i32 @foo2() nounwind { +; MIPSELR6: .set push +; MIPSELR6-NEXT: .set at +; MIPSELR6-NEXT: .set macro +; MIPSELR6-NEXT: .set reorder +; MIPSELR6: .set noreorder +; MIPSELR6-NEXT: beqzc $9, End +; MIPSELR6-NEXT: nop +; MIPSELR6-NEXT: .set reorder +; MIPSELR6: addiu $9, $9, 1 +entry: + %0 = tail call i32 asm "beqzc $$t1, End", "=r"() nounwind + %1 = tail call i32 asm "addiu $$t1, $$t1, 1", "=r"() nounwind + %2 = add nsw i32 %1, %0 + ret i32 %2 +} + +define i32 @foo3() nounwind { +; MIPSELR6: addiu $2, $2, 1 +; MIPSELR6-NEXT: .set noreorder +; MIPSELR6-NEXT: beqzc $2, $tmp1 +; MIPSELR6-NEXT: nop +; MIPSELR6-NEXT: .set noreorder +; MIPSELR6-NEXT: j End +; MIPSELR6-NEXT: nop +; MIPSELR6-NEXT: .set reorder +entry: + %0 = tail call i32 asm "1: addiu $0, $0, 1; beqzc $0, 1b; j End", "=r"() nounwind + ret i32 %0 +} + +define i32 @foo4() nounwind { +; MIPSELR6: addiu $2, $2, 1 +; MIPSELR6-NEXT: .set noreorder +; MIPSELR6-NEXT: beqzc $2, $tmp2 +; MIPSELR6-NEXT: addiu $2, $2, 1 +; MIPSELR6-NEXT: .set reorder +entry: + %0 = tail call i32 asm "1: addiu $0, $0, 1; beqzc $0, 1b; addiu $0, $0, 1", "=r"() nounwind + ret i32 %0 +} diff --git a/llvm/test/CodeGen/Mips/pseudo-jump-fill.ll b/llvm/test/CodeGen/Mips/pseudo-jump-fill.ll index afb79e55f4f90b..5d7cdbc0b69edf 100644 --- a/llvm/test/CodeGen/Mips/pseudo-jump-fill.ll +++ b/llvm/test/CodeGen/Mips/pseudo-jump-fill.ll @@ -14,7 +14,7 @@ define i32 @test(i32 signext %x, i32 signext %c) { ; CHECK-NEXT: sltiu $1, $5, 4 ; CHECK-NEXT: beqz $1, $BB0_6 ; CHECK-NEXT: addu $3, $2, $25 -; CHECK-NEXT: $BB0_1: # %entry +; CHECK-NEXT: # %bb.1: # %entry ; CHECK-NEXT: li16 $2, 0 ; CHECK-NEXT: sll16 $5, $5, 2 ; CHECK-NEXT: lw $6, %got($JTI0_0)($3) diff --git a/llvm/test/CodeGen/PowerPC/crsave.ll b/llvm/test/CodeGen/PowerPC/crsave.ll index 81e7a0adcc8ca1..05da108e1fbf87 100644 --- a/llvm/test/CodeGen/PowerPC/crsave.ll +++ b/llvm/test/CodeGen/PowerPC/crsave.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -O0 -frame-pointer=all -mtriple=powerpc-unknown-linux-gnu -mcpu=g5 < %s | FileCheck %s -check-prefix=PPC32 ; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu -mcpu=g5 < %s | FileCheck %s -check-prefix=PPC64 ; RUN: llc -O0 -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs < %s | FileCheck %s -check-prefix=PPC64-ELFv2 @@ -5,6 +6,102 @@ declare void @foo() define i32 @test_cr2() nounwind uwtable { +; PPC32-LABEL: test_cr2: +; PPC32: # %bb.0: # %entry +; PPC32-NEXT: mflr 0 +; PPC32-NEXT: stwu 1, -32(1) +; PPC32-NEXT: stw 31, 28(1) +; PPC32-NEXT: stw 0, 36(1) +; PPC32-NEXT: .cfi_def_cfa_offset 32 +; PPC32-NEXT: .cfi_offset r31, -4 +; PPC32-NEXT: .cfi_offset lr, 4 +; PPC32-NEXT: mr 31, 1 +; PPC32-NEXT: .cfi_def_cfa_register r31 +; PPC32-NEXT: .cfi_offset cr2, -8 +; PPC32-NEXT: mfcr 12 +; PPC32-NEXT: stw 12, 24(31) +; PPC32-NEXT: li 3, 1 +; PPC32-NEXT: li 4, 2 +; PPC32-NEXT: li 5, 3 +; PPC32-NEXT: li 6, 0 +; PPC32-NEXT: #APP +; PPC32-EMPTY: +; PPC32-NEXT: mtcr 6 +; PPC32-NEXT: cmpw 2, 4, 3 +; PPC32-NEXT: mfcr 3 +; PPC32-NEXT: #NO_APP +; PPC32-NEXT: stw 3, 20(31) +; PPC32-NEXT: bl foo +; PPC32-NEXT: lwz 3, 20(31) +; PPC32-NEXT: lwz 12, 24(31) +; PPC32-NEXT: mtocrf 32, 12 +; PPC32-NEXT: lwz 0, 36(1) +; PPC32-NEXT: lwz 31, 28(1) +; PPC32-NEXT: addi 1, 1, 32 +; PPC32-NEXT: mtlr 0 +; PPC32-NEXT: blr +; +; PPC64-LABEL: test_cr2: +; PPC64: # %bb.0: # %entry +; PPC64-NEXT: mflr 0 +; PPC64-NEXT: mfcr 12 +; PPC64-NEXT: stw 12, 8(1) +; PPC64-NEXT: stdu 1, -128(1) +; PPC64-NEXT: std 0, 144(1) +; PPC64-NEXT: .cfi_def_cfa_offset 128 +; PPC64-NEXT: .cfi_offset lr, 16 +; PPC64-NEXT: .cfi_offset cr2, 8 +; PPC64-NEXT: li 3, 1 +; PPC64-NEXT: li 4, 2 +; PPC64-NEXT: li 5, 3 +; PPC64-NEXT: li 6, 0 +; PPC64-NEXT: #APP +; PPC64-EMPTY: +; PPC64-NEXT: mtcr 6 +; PPC64-NEXT: cmpw 2, 4, 3 +; PPC64-NEXT: mfcr 3 +; PPC64-NEXT: #NO_APP +; PPC64-NEXT: stw 3, 124(1) +; PPC64-NEXT: bl foo +; PPC64-NEXT: nop +; 
PPC64-NEXT: lwz 3, 124(1) +; PPC64-NEXT: addi 1, 1, 128 +; PPC64-NEXT: ld 0, 16(1) +; PPC64-NEXT: lwz 12, 8(1) +; PPC64-NEXT: mtocrf 32, 12 +; PPC64-NEXT: mtlr 0 +; PPC64-NEXT: blr +; +; PPC64-ELFv2-LABEL: test_cr2: +; PPC64-ELFv2: # %bb.0: # %entry +; PPC64-ELFv2-NEXT: mflr 0 +; PPC64-ELFv2-NEXT: mfocrf 12, 32 +; PPC64-ELFv2-NEXT: stw 12, 8(1) +; PPC64-ELFv2-NEXT: stdu 1, -112(1) +; PPC64-ELFv2-NEXT: std 0, 128(1) +; PPC64-ELFv2-NEXT: .cfi_def_cfa_offset 112 +; PPC64-ELFv2-NEXT: .cfi_offset lr, 16 +; PPC64-ELFv2-NEXT: .cfi_offset cr2, 8 +; PPC64-ELFv2-NEXT: li 3, 1 +; PPC64-ELFv2-NEXT: li 4, 2 +; PPC64-ELFv2-NEXT: li 5, 3 +; PPC64-ELFv2-NEXT: li 6, 0 +; PPC64-ELFv2-NEXT: #APP +; PPC64-ELFv2-EMPTY: +; PPC64-ELFv2-NEXT: mtcr 6 +; PPC64-ELFv2-NEXT: cmpw 2, 4, 3 +; PPC64-ELFv2-NEXT: mfcr 3 +; PPC64-ELFv2-NEXT: #NO_APP +; PPC64-ELFv2-NEXT: stw 3, 108(1) +; PPC64-ELFv2-NEXT: bl foo +; PPC64-ELFv2-NEXT: nop +; PPC64-ELFv2-NEXT: lwz 3, 108(1) +; PPC64-ELFv2-NEXT: addi 1, 1, 112 +; PPC64-ELFv2-NEXT: ld 0, 16(1) +; PPC64-ELFv2-NEXT: lwz 12, 8(1) +; PPC64-ELFv2-NEXT: mtocrf 32, 12 +; PPC64-ELFv2-NEXT: mtlr 0 +; PPC64-ELFv2-NEXT: blr entry: %ret = alloca i32, align 4 %0 = call i32 asm sideeffect "\0A\09mtcr $4\0A\09cmpw 2,$2,$1\0A\09mfcr $0", "=r,r,r,r,r,~{cr2}"(i32 1, i32 2, i32 3, i32 0) nounwind @@ -14,27 +111,104 @@ entry: ret i32 %1 } -; PPC32-LABEL: test_cr2: -; PPC32: stwu 1, -32(1) -; PPC32: stw 31, 28(1) -; PPC32: mfcr 12 -; PPC32-NEXT: stw 12, 24(31) -; PPC32: lwz 12, 24(31) -; PPC32-NEXT: mtocrf 32, 12 - -; PPC64: .cfi_startproc -; PPC64: mfcr 12 -; PPC64: stw 12, 8(1) -; PPC64: stdu 1, -[[AMT:[0-9]+]](1) -; PPC64: .cfi_def_cfa_offset 128 -; PPC64: .cfi_offset lr, 16 -; PPC64: .cfi_offset cr2, 8 -; PPC64: addi 1, 1, [[AMT]] -; PPC64: lwz 12, 8(1) -; PPC64: mtocrf 32, 12 -; PPC64: .cfi_endproc - define i32 @test_cr234() nounwind { +; PPC32-LABEL: test_cr234: +; PPC32: # %bb.0: # %entry +; PPC32-NEXT: mflr 0 +; PPC32-NEXT: stwu 1, -32(1) +; PPC32-NEXT: stw 31, 28(1) 
+; PPC32-NEXT: stw 0, 36(1) +; PPC32-NEXT: mr 31, 1 +; PPC32-NEXT: mfcr 12 +; PPC32-NEXT: stw 12, 24(31) +; PPC32-NEXT: li 3, 1 +; PPC32-NEXT: li 4, 2 +; PPC32-NEXT: li 5, 3 +; PPC32-NEXT: li 6, 0 +; PPC32-NEXT: #APP +; PPC32-EMPTY: +; PPC32-NEXT: mtcr 6 +; PPC32-NEXT: cmpw 2, 4, 3 +; PPC32-NEXT: cmpw 3, 4, 4 +; PPC32-NEXT: cmpw 4, 4, 5 +; PPC32-NEXT: mfcr 3 +; PPC32-NEXT: #NO_APP +; PPC32-NEXT: stw 3, 20(31) +; PPC32-NEXT: bl foo +; PPC32-NEXT: lwz 3, 20(31) +; PPC32-NEXT: lwz 12, 24(31) +; PPC32-NEXT: mtocrf 32, 12 +; PPC32-NEXT: mtocrf 16, 12 +; PPC32-NEXT: mtocrf 8, 12 +; PPC32-NEXT: lwz 0, 36(1) +; PPC32-NEXT: lwz 31, 28(1) +; PPC32-NEXT: addi 1, 1, 32 +; PPC32-NEXT: mtlr 0 +; PPC32-NEXT: blr +; +; PPC64-LABEL: test_cr234: +; PPC64: # %bb.0: # %entry +; PPC64-NEXT: mflr 0 +; PPC64-NEXT: mfcr 12 +; PPC64-NEXT: stw 12, 8(1) +; PPC64-NEXT: stdu 1, -128(1) +; PPC64-NEXT: std 0, 144(1) +; PPC64-NEXT: li 3, 1 +; PPC64-NEXT: li 4, 2 +; PPC64-NEXT: li 5, 3 +; PPC64-NEXT: li 6, 0 +; PPC64-NEXT: #APP +; PPC64-EMPTY: +; PPC64-NEXT: mtcr 6 +; PPC64-NEXT: cmpw 2, 4, 3 +; PPC64-NEXT: cmpw 3, 4, 4 +; PPC64-NEXT: cmpw 4, 4, 5 +; PPC64-NEXT: mfcr 3 +; PPC64-NEXT: #NO_APP +; PPC64-NEXT: stw 3, 124(1) +; PPC64-NEXT: bl foo +; PPC64-NEXT: nop +; PPC64-NEXT: lwz 3, 124(1) +; PPC64-NEXT: addi 1, 1, 128 +; PPC64-NEXT: ld 0, 16(1) +; PPC64-NEXT: lwz 12, 8(1) +; PPC64-NEXT: mtocrf 32, 12 +; PPC64-NEXT: mtocrf 16, 12 +; PPC64-NEXT: mtocrf 8, 12 +; PPC64-NEXT: mtlr 0 +; PPC64-NEXT: blr +; +; PPC64-ELFv2-LABEL: test_cr234: +; PPC64-ELFv2: # %bb.0: # %entry +; PPC64-ELFv2-NEXT: mflr 0 +; PPC64-ELFv2-NEXT: mfcr 12 +; PPC64-ELFv2-NEXT: stw 12, 8(1) +; PPC64-ELFv2-NEXT: stdu 1, -112(1) +; PPC64-ELFv2-NEXT: std 0, 128(1) +; PPC64-ELFv2-NEXT: li 3, 1 +; PPC64-ELFv2-NEXT: li 4, 2 +; PPC64-ELFv2-NEXT: li 5, 3 +; PPC64-ELFv2-NEXT: li 6, 0 +; PPC64-ELFv2-NEXT: #APP +; PPC64-ELFv2-EMPTY: +; PPC64-ELFv2-NEXT: mtcr 6 +; PPC64-ELFv2-NEXT: cmpw 2, 4, 3 +; PPC64-ELFv2-NEXT: cmpw 3, 4, 4 +; 
PPC64-ELFv2-NEXT: cmpw 4, 4, 5 +; PPC64-ELFv2-NEXT: mfcr 3 +; PPC64-ELFv2-NEXT: #NO_APP +; PPC64-ELFv2-NEXT: stw 3, 108(1) +; PPC64-ELFv2-NEXT: bl foo +; PPC64-ELFv2-NEXT: nop +; PPC64-ELFv2-NEXT: lwz 3, 108(1) +; PPC64-ELFv2-NEXT: addi 1, 1, 112 +; PPC64-ELFv2-NEXT: ld 0, 16(1) +; PPC64-ELFv2-NEXT: lwz 12, 8(1) +; PPC64-ELFv2-NEXT: mtocrf 32, 12 +; PPC64-ELFv2-NEXT: mtocrf 16, 12 +; PPC64-ELFv2-NEXT: mtocrf 8, 12 +; PPC64-ELFv2-NEXT: mtlr 0 +; PPC64-ELFv2-NEXT: blr entry: %ret = alloca i32, align 4 %0 = call i32 asm sideeffect "\0A\09mtcr $4\0A\09cmpw 2,$2,$1\0A\09cmpw 3,$2,$2\0A\09cmpw 4,$2,$3\0A\09mfcr $0", "=r,r,r,r,r,~{cr2},~{cr3},~{cr4}"(i32 1, i32 2, i32 3, i32 0) nounwind @@ -44,41 +218,106 @@ entry: ret i32 %1 } -; PPC32-LABEL: test_cr234: -; PPC32: stwu 1, -32(1) -; PPC32: stw 31, 28(1) -; PPC32: mfcr 12 -; PPC32-NEXT: stw 12, 24(31) -; PPC32: lwz 12, 24(31) -; PPC32-NEXT: mtocrf 32, 12 -; PPC32-NEXT: mtocrf 16, 12 -; PPC32-NEXT: mtocrf 8, 12 - -; PPC64: mfcr 12 -; PPC64: stw 12, 8(1) -; PPC64: stdu 1, -[[AMT:[0-9]+]](1) -; PPC64: addi 1, 1, [[AMT]] -; PPC64: lwz 12, 8(1) -; PPC64: mtocrf 32, 12 -; PPC64: mtocrf 16, 12 -; PPC64: mtocrf 8, 12 - ; Generate mfocrf in prologue when we need to save 1 nonvolatile CR field define void @cloberOneNvCrField() { +; PPC32-LABEL: cloberOneNvCrField: +; PPC32: # %bb.0: # %entry +; PPC32-NEXT: stwu 1, -32(1) +; PPC32-NEXT: stw 31, 28(1) +; PPC32-NEXT: .cfi_def_cfa_offset 32 +; PPC32-NEXT: .cfi_offset r31, -4 +; PPC32-NEXT: mr 31, 1 +; PPC32-NEXT: .cfi_def_cfa_register r31 +; PPC32-NEXT: .cfi_offset cr2, -8 +; PPC32-NEXT: mfcr 12 +; PPC32-NEXT: stw 12, 24(31) +; PPC32-NEXT: #APP +; PPC32-NEXT: # clobbers +; PPC32-NEXT: #NO_APP +; PPC32-NEXT: lwz 12, 24(31) +; PPC32-NEXT: mtocrf 32, 12 +; PPC32-NEXT: lwz 31, 28(1) +; PPC32-NEXT: addi 1, 1, 32 +; PPC32-NEXT: blr +; +; PPC64-LABEL: cloberOneNvCrField: +; PPC64: # %bb.0: # %entry +; PPC64-NEXT: mfcr 12 +; PPC64-NEXT: stw 12, 8(1) +; PPC64-NEXT: #APP +; PPC64-NEXT: # clobbers 
+; PPC64-NEXT: #NO_APP +; PPC64-NEXT: lwz 12, 8(1) +; PPC64-NEXT: mtocrf 32, 12 +; PPC64-NEXT: blr +; +; PPC64-ELFv2-LABEL: cloberOneNvCrField: +; PPC64-ELFv2: # %bb.0: # %entry +; PPC64-ELFv2-NEXT: mfocrf 12, 32 +; PPC64-ELFv2-NEXT: stw 12, 8(1) +; PPC64-ELFv2-NEXT: #APP +; PPC64-ELFv2-NEXT: # clobbers +; PPC64-ELFv2-NEXT: #NO_APP +; PPC64-ELFv2-NEXT: lwz 12, 8(1) +; PPC64-ELFv2-NEXT: mtocrf 32, 12 +; PPC64-ELFv2-NEXT: blr entry: tail call void asm sideeffect "# clobbers", "~{cr2}"() ret void - -; PPC64-ELFv2-LABEL: @cloberOneNvCrField -; PPC64-ELFv2: mfocrf [[REG1:[0-9]+]], 32 } ; Generate mfcr in prologue when we need to save all nonvolatile CR field define void @cloberAllNvCrField() { +; PPC32-LABEL: cloberAllNvCrField: +; PPC32: # %bb.0: # %entry +; PPC32-NEXT: stwu 1, -32(1) +; PPC32-NEXT: stw 31, 28(1) +; PPC32-NEXT: .cfi_def_cfa_offset 32 +; PPC32-NEXT: .cfi_offset r31, -4 +; PPC32-NEXT: mr 31, 1 +; PPC32-NEXT: .cfi_def_cfa_register r31 +; PPC32-NEXT: .cfi_offset cr2, -8 +; PPC32-NEXT: .cfi_offset cr3, -8 +; PPC32-NEXT: .cfi_offset cr4, -8 +; PPC32-NEXT: mfcr 12 +; PPC32-NEXT: stw 12, 24(31) +; PPC32-NEXT: #APP +; PPC32-NEXT: # clobbers +; PPC32-NEXT: #NO_APP +; PPC32-NEXT: lwz 12, 24(31) +; PPC32-NEXT: mtocrf 32, 12 +; PPC32-NEXT: mtocrf 16, 12 +; PPC32-NEXT: mtocrf 8, 12 +; PPC32-NEXT: lwz 31, 28(1) +; PPC32-NEXT: addi 1, 1, 32 +; PPC32-NEXT: blr +; +; PPC64-LABEL: cloberAllNvCrField: +; PPC64: # %bb.0: # %entry +; PPC64-NEXT: mfcr 12 +; PPC64-NEXT: stw 12, 8(1) +; PPC64-NEXT: #APP +; PPC64-NEXT: # clobbers +; PPC64-NEXT: #NO_APP +; PPC64-NEXT: lwz 12, 8(1) +; PPC64-NEXT: mtocrf 32, 12 +; PPC64-NEXT: mtocrf 16, 12 +; PPC64-NEXT: mtocrf 8, 12 +; PPC64-NEXT: blr +; +; PPC64-ELFv2-LABEL: cloberAllNvCrField: +; PPC64-ELFv2: # %bb.0: # %entry +; PPC64-ELFv2-NEXT: mfcr 12 +; PPC64-ELFv2-NEXT: stw 12, 8(1) +; PPC64-ELFv2-NEXT: #APP +; PPC64-ELFv2-NEXT: # clobbers +; PPC64-ELFv2-NEXT: #NO_APP +; PPC64-ELFv2-NEXT: lwz 12, 8(1) +; PPC64-ELFv2-NEXT: mtocrf 32, 12 +; 
PPC64-ELFv2-NEXT: mtocrf 16, 12 +; PPC64-ELFv2-NEXT: mtocrf 8, 12 +; PPC64-ELFv2-NEXT: blr entry: tail call void asm sideeffect "# clobbers", "~{cr2},~{cr3},~{cr4}"() ret void - -; PPC64-ELFv2-LABEL: @cloberAllNvCrField -; PPC64-ELFv2: mfcr [[REG1:[0-9]+]] } diff --git a/llvm/test/CodeGen/PowerPC/pr59074.ll b/llvm/test/CodeGen/PowerPC/pr59074.ll new file mode 100644 index 00000000000000..3e328c6ad9f0ba --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/pr59074.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s --check-prefix=LE64 +; RUN: llc -mtriple=powerpcle-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s --check-prefix=LE32 +; RUN: llc -mtriple=powerpc64-ibm-aix -mcpu=pwr7 < %s | FileCheck %s --check-prefix=BE64 +; RUN: llc -mtriple=powerpc-ibm-aix -mcpu=pwr7 < %s | FileCheck %s --check-prefix=BE32 + +; To verify this doesn't crash due to array out of bound. +define void @pr59074(ptr %0) { +; LE64-LABEL: pr59074: +; LE64: # %bb.0: # %entry +; LE64-NEXT: lwz 6, 0(3) +; LE64-NEXT: li 7, 12 +; LE64-NEXT: ld 4, 16(3) +; LE64-NEXT: ld 5, 24(3) +; LE64-NEXT: addi 6, 6, -12 +; LE64-NEXT: std 4, 16(3) +; LE64-NEXT: std 5, 24(3) +; LE64-NEXT: srd 6, 7, 6 +; LE64-NEXT: li 7, 0 +; LE64-NEXT: std 7, 8(3) +; LE64-NEXT: std 6, 0(3) +; LE64-NEXT: blr +; +; LE32-LABEL: pr59074: +; LE32: # %bb.0: # %entry +; LE32-NEXT: stwu 1, -80(1) +; LE32-NEXT: .cfi_def_cfa_offset 80 +; LE32-NEXT: lwz 4, 0(3) +; LE32-NEXT: xxlxor 0, 0, 0 +; LE32-NEXT: li 5, 4 +; LE32-NEXT: addi 6, 1, 16 +; LE32-NEXT: li 7, 0 +; LE32-NEXT: li 8, 12 +; LE32-NEXT: xxswapd 0, 0 +; LE32-NEXT: addi 4, 4, -12 +; LE32-NEXT: rlwinm 9, 4, 29, 28, 31 +; LE32-NEXT: stxvd2x 0, 6, 5 +; LE32-NEXT: stw 7, 44(1) +; LE32-NEXT: stw 7, 40(1) +; LE32-NEXT: stw 7, 36(1) +; LE32-NEXT: stw 8, 16(1) +; LE32-NEXT: lwzux 5, 9, 6 +; LE32-NEXT: li 6, 7 +; LE32-NEXT: lwz 7, 8(9) +; LE32-NEXT: nand 6, 4, 6 +; 
LE32-NEXT: lwz 8, 4(9) +; LE32-NEXT: clrlwi 4, 4, 29 +; LE32-NEXT: lwz 9, 12(9) +; LE32-NEXT: clrlwi 6, 6, 27 +; LE32-NEXT: subfic 11, 4, 32 +; LE32-NEXT: srw 5, 5, 4 +; LE32-NEXT: slwi 10, 7, 1 +; LE32-NEXT: srw 7, 7, 4 +; LE32-NEXT: slw 6, 10, 6 +; LE32-NEXT: srw 10, 8, 4 +; LE32-NEXT: slw 8, 8, 11 +; LE32-NEXT: slw 11, 9, 11 +; LE32-NEXT: srw 4, 9, 4 +; LE32-NEXT: or 5, 8, 5 +; LE32-NEXT: or 7, 11, 7 +; LE32-NEXT: or 6, 10, 6 +; LE32-NEXT: stw 4, 12(3) +; LE32-NEXT: stw 7, 8(3) +; LE32-NEXT: stw 5, 0(3) +; LE32-NEXT: stw 6, 4(3) +; LE32-NEXT: addi 1, 1, 80 +; LE32-NEXT: blr +; +; BE64-LABEL: pr59074: +; BE64: # %bb.0: # %entry +; BE64-NEXT: lwz 6, 12(3) +; BE64-NEXT: li 7, 12 +; BE64-NEXT: ld 4, 24(3) +; BE64-NEXT: ld 5, 16(3) +; BE64-NEXT: addi 6, 6, -12 +; BE64-NEXT: std 4, 24(3) +; BE64-NEXT: std 5, 16(3) +; BE64-NEXT: srd 6, 7, 6 +; BE64-NEXT: li 7, 0 +; BE64-NEXT: std 7, 0(3) +; BE64-NEXT: std 6, 8(3) +; BE64-NEXT: blr +; +; BE32-LABEL: pr59074: +; BE32: # %bb.0: # %entry +; BE32-NEXT: lwz 4, 12(3) +; BE32-NEXT: xxlxor 0, 0, 0 +; BE32-NEXT: addi 5, 1, -64 +; BE32-NEXT: li 6, 12 +; BE32-NEXT: li 7, 0 +; BE32-NEXT: addi 8, 1, -48 +; BE32-NEXT: li 10, 7 +; BE32-NEXT: stxvw4x 0, 0, 5 +; BE32-NEXT: addi 4, 4, -12 +; BE32-NEXT: stw 6, -36(1) +; BE32-NEXT: stw 7, -40(1) +; BE32-NEXT: stw 7, -44(1) +; BE32-NEXT: rlwinm 9, 4, 29, 28, 31 +; BE32-NEXT: stw 7, -48(1) +; BE32-NEXT: sub 5, 8, 9 +; BE32-NEXT: nand 6, 4, 10 +; BE32-NEXT: clrlwi 4, 4, 29 +; BE32-NEXT: clrlwi 6, 6, 27 +; BE32-NEXT: lwz 7, 4(5) +; BE32-NEXT: lwz 8, 8(5) +; BE32-NEXT: lwz 9, 0(5) +; BE32-NEXT: lwz 5, 12(5) +; BE32-NEXT: slwi 10, 7, 1 +; BE32-NEXT: srw 11, 8, 4 +; BE32-NEXT: srw 7, 7, 4 +; BE32-NEXT: srw 5, 5, 4 +; BE32-NEXT: slw 6, 10, 6 +; BE32-NEXT: subfic 10, 4, 32 +; BE32-NEXT: srw 4, 9, 4 +; BE32-NEXT: slw 8, 8, 10 +; BE32-NEXT: slw 10, 9, 10 +; BE32-NEXT: or 6, 11, 6 +; BE32-NEXT: or 7, 10, 7 +; BE32-NEXT: or 5, 8, 5 +; BE32-NEXT: stw 4, 0(3) +; BE32-NEXT: stw 6, 8(3) +; BE32-NEXT: stw 
5, 12(3) +; BE32-NEXT: stw 7, 4(3) +; BE32-NEXT: blr +entry: + %v1 = load <2 x i128>, <2 x i128>* %0 + %v2 = insertelement <2 x i128> %v1, i128 12, i32 0 + %v3 = sub <2 x i128> %v1, %v2 + %v4 = lshr <2 x i128> %v2, %v3 + store <2 x i128> %v4, <2 x i128>* %0 + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/scalar-double-ldst.ll b/llvm/test/CodeGen/PowerPC/scalar-double-ldst.ll index 6f68679325c579..798637b6840f1e 100644 --- a/llvm/test/CodeGen/PowerPC/scalar-double-ldst.ll +++ b/llvm/test/CodeGen/PowerPC/scalar-double-ldst.ll @@ -7281,3 +7281,61 @@ entry: store double %str, ptr inttoptr (i64 1000000000000 to ptr), align 4096 ret void } + +define dso_local void @st_reversed_double_from_i8(ptr %ptr) { +; CHECK-P10-LABEL: st_reversed_double_from_i8: +; CHECK-P10: # %bb.0: # %entry +; CHECK-P10-NEXT: li r4, 8 +; CHECK-P10-NEXT: lxsibzx f0, 0, r3 +; CHECK-P10-NEXT: xxspltidp vs2, -1023410176 +; CHECK-P10-NEXT: lxsibzx f1, r3, r4 +; CHECK-P10-NEXT: xscvuxddp f0, f0 +; CHECK-P10-NEXT: xscvuxddp f1, f1 +; CHECK-P10-NEXT: xsadddp f0, f0, f2 +; CHECK-P10-NEXT: xsadddp f1, f1, f2 +; CHECK-P10-NEXT: stfd f1, 0(r3) +; CHECK-P10-NEXT: stfd f0, 8(r3) +; CHECK-P10-NEXT: blr +; +; CHECK-P9-LABEL: st_reversed_double_from_i8: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: li r4, 8 +; CHECK-P9-NEXT: lxsibzx f0, 0, r3 +; CHECK-P9-NEXT: lxsibzx f1, r3, r4 +; CHECK-P9-NEXT: addis r4, r2, .LCPI300_0@toc@ha +; CHECK-P9-NEXT: lfs f2, .LCPI300_0@toc@l(r4) +; CHECK-P9-NEXT: xscvuxddp f0, f0 +; CHECK-P9-NEXT: xscvuxddp f1, f1 +; CHECK-P9-NEXT: xsadddp f0, f0, f2 +; CHECK-P9-NEXT: xsadddp f1, f1, f2 +; CHECK-P9-NEXT: stfd f0, 8(r3) +; CHECK-P9-NEXT: stfd f1, 0(r3) +; CHECK-P9-NEXT: blr +; +; CHECK-P8-LABEL: st_reversed_double_from_i8: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: lbz r4, 0(r3) +; CHECK-P8-NEXT: lbz r5, 8(r3) +; CHECK-P8-NEXT: mtfprwz f0, r4 +; CHECK-P8-NEXT: mtfprwz f1, r5 +; CHECK-P8-NEXT: addis r4, r2, .LCPI300_0@toc@ha +; CHECK-P8-NEXT: lfs f2, .LCPI300_0@toc@l(r4) 
+; CHECK-P8-NEXT: xscvuxddp f0, f0 +; CHECK-P8-NEXT: xscvuxddp f1, f1 +; CHECK-P8-NEXT: xsadddp f0, f0, f2 +; CHECK-P8-NEXT: xsadddp f1, f1, f2 +; CHECK-P8-NEXT: stfd f1, 0(r3) +; CHECK-P8-NEXT: stfd f0, 8(r3) +; CHECK-P8-NEXT: blr +entry: + %idx = getelementptr inbounds i8, ptr %ptr, i64 8 + %i0 = load i8, ptr %ptr, align 1 + %i1 = load i8, ptr %idx, align 1 + %f0 = uitofp i8 %i0 to double + %f1 = uitofp i8 %i1 to double + %a0 = fadd double %f0, -1.280000e+02 + %a1 = fadd double %f1, -1.280000e+02 + store double %a1, ptr %ptr, align 8 + store double %a0, ptr %idx, align 8 + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/scalar-float-ldst.ll b/llvm/test/CodeGen/PowerPC/scalar-float-ldst.ll index 824dd4c4db6cb7..f3960573421298 100644 --- a/llvm/test/CodeGen/PowerPC/scalar-float-ldst.ll +++ b/llvm/test/CodeGen/PowerPC/scalar-float-ldst.ll @@ -7271,3 +7271,61 @@ entry: store double %conv, ptr inttoptr (i64 1000000000000 to ptr), align 4096 ret void } + +define dso_local void @st_reversed_float_from_i8(ptr %ptr) { +; CHECK-P10-LABEL: st_reversed_float_from_i8: +; CHECK-P10: # %bb.0: # %entry +; CHECK-P10-NEXT: li r4, 8 +; CHECK-P10-NEXT: lxsibzx f0, 0, r3 +; CHECK-P10-NEXT: xxspltidp vs2, -1023410176 +; CHECK-P10-NEXT: lxsibzx f1, r3, r4 +; CHECK-P10-NEXT: xscvuxdsp f0, f0 +; CHECK-P10-NEXT: xscvuxdsp f1, f1 +; CHECK-P10-NEXT: xsaddsp f0, f0, f2 +; CHECK-P10-NEXT: xsaddsp f1, f1, f2 +; CHECK-P10-NEXT: stfs f0, 8(r3) +; CHECK-P10-NEXT: stfs f1, 0(r3) +; CHECK-P10-NEXT: blr +; +; CHECK-P9-LABEL: st_reversed_float_from_i8: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: li r4, 8 +; CHECK-P9-NEXT: lxsibzx f0, 0, r3 +; CHECK-P9-NEXT: lxsibzx f1, r3, r4 +; CHECK-P9-NEXT: addis r4, r2, .LCPI300_0@toc@ha +; CHECK-P9-NEXT: lfs f2, .LCPI300_0@toc@l(r4) +; CHECK-P9-NEXT: xscvuxdsp f0, f0 +; CHECK-P9-NEXT: xscvuxdsp f1, f1 +; CHECK-P9-NEXT: xsaddsp f0, f0, f2 +; CHECK-P9-NEXT: xsaddsp f1, f1, f2 +; CHECK-P9-NEXT: stfs f0, 8(r3) +; CHECK-P9-NEXT: stfs f1, 0(r3) +; 
CHECK-P9-NEXT: blr +; +; CHECK-P8-LABEL: st_reversed_float_from_i8: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: lbz r4, 0(r3) +; CHECK-P8-NEXT: lbz r5, 8(r3) +; CHECK-P8-NEXT: mtfprwz f0, r4 +; CHECK-P8-NEXT: mtfprwz f1, r5 +; CHECK-P8-NEXT: addis r4, r2, .LCPI300_0@toc@ha +; CHECK-P8-NEXT: lfs f2, .LCPI300_0@toc@l(r4) +; CHECK-P8-NEXT: xscvuxdsp f0, f0 +; CHECK-P8-NEXT: xscvuxdsp f1, f1 +; CHECK-P8-NEXT: xsaddsp f0, f0, f2 +; CHECK-P8-NEXT: xsaddsp f1, f1, f2 +; CHECK-P8-NEXT: stfs f1, 0(r3) +; CHECK-P8-NEXT: stfs f0, 8(r3) +; CHECK-P8-NEXT: blr +entry: + %idx = getelementptr inbounds i8, ptr %ptr, i64 8 + %i0 = load i8, ptr %ptr, align 1 + %i1 = load i8, ptr %idx, align 1 + %f0 = uitofp i8 %i0 to float + %f1 = uitofp i8 %i1 to float + %a0 = fadd float %f0, -1.280000e+02 + %a1 = fadd float %f1, -1.280000e+02 + store float %a1, ptr %ptr, align 8 + store float %a0, ptr %idx, align 8 + ret void +} diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index b90bef7525379d..ecd14eaffcb5d5 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -84,7 +84,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+zve32x -mattr=+zvksh %s -o - | FileCheck --check-prefix=RV32ZVKSH %s ; RUN: llc -mtriple=riscv32 -mattr=+zve32x -mattr=+zvkt %s -o - | FileCheck --check-prefix=RV32ZVKT %s ; RUN: llc -mtriple=riscv32 -mattr=+zvfh %s -o - | FileCheck --check-prefix=RV32ZVFH %s -; RUN: llc -mtriple=riscv32 -mattr=+experimental-zicond %s -o - | FileCheck --check-prefix=RV32ZICOND %s +; RUN: llc -mtriple=riscv32 -mattr=+zicond %s -o - | FileCheck --check-prefix=RV32ZICOND %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zimop %s -o - | FileCheck --check-prefix=RV32ZIMOP %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zcmop %s -o - | FileCheck --check-prefix=RV32ZCMOP %s ; RUN: llc -mtriple=riscv32 -mattr=+smaia %s -o - | FileCheck --check-prefixes=CHECK,RV32SMAIA %s @@ -186,7 +186,7 @@ ; RUN: llc 
-mtriple=riscv64 -mattr=+zve32x -mattr=+zvksh %s -o - | FileCheck --check-prefix=RV64ZVKSH %s ; RUN: llc -mtriple=riscv64 -mattr=+zve32x -mattr=+zvkt %s -o - | FileCheck --check-prefix=RV64ZVKT %s ; RUN: llc -mtriple=riscv64 -mattr=+zvfh %s -o - | FileCheck --check-prefix=RV64ZVFH %s -; RUN: llc -mtriple=riscv64 -mattr=+experimental-zicond %s -o - | FileCheck --check-prefix=RV64ZICOND %s +; RUN: llc -mtriple=riscv64 -mattr=+zicond %s -o - | FileCheck --check-prefix=RV64ZICOND %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zimop %s -o - | FileCheck --check-prefix=RV64ZIMOP %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zcmop %s -o - | FileCheck --check-prefix=RV64ZCMOP %s ; RUN: llc -mtriple=riscv64 -mattr=+smaia %s -o - | FileCheck --check-prefixes=CHECK,RV64SMAIA %s diff --git a/llvm/test/CodeGen/RISCV/branch-opt.mir b/llvm/test/CodeGen/RISCV/branch-opt.mir new file mode 100644 index 00000000000000..ba3a20f2fbfcd3 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/branch-opt.mir @@ -0,0 +1,68 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc %s -mtriple=riscv64 -run-pass=peephole-opt -o - | FileCheck %s + +# Make sure we shouldn't replace the %2 ADDI with the $x10 ADDI since it has a +# physical register destination. + +--- | + define void @foo(i32 signext %0) { + tail call void @bar(i32 1) + %2 = icmp ugt i32 %0, 1 + br i1 %2, label %3, label %4 + + 3: ; preds = %1 + tail call void @bar(i32 3) + ret void + + 4: ; preds = %1 + ret void + } + + declare void @bar(...) + +... 
+--- +name: foo +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: foo + ; CHECK: bb.0 (%ir-block.1): + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $x2, implicit $x2 + ; CHECK-NEXT: $x10 = ADDI $x0, 1 + ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) @bar, csr_ilp32_lp64, implicit-def dead $x1, implicit $x10, implicit-def $x2 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $x2, implicit $x2 + ; CHECK-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 2 + ; CHECK-NEXT: BLTU [[COPY]], killed [[ADDI]], %bb.2 + ; CHECK-NEXT: PseudoBR %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1 (%ir-block.3): + ; CHECK-NEXT: $x10 = ADDI $x0, 3 + ; CHECK-NEXT: PseudoTAIL target-flags(riscv-call) @bar, implicit $x2, implicit $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2 (%ir-block.4): + ; CHECK-NEXT: PseudoRET + bb.0 (%ir-block.1): + successors: %bb.1, %bb.2 + liveins: $x10 + + %0:gpr = COPY $x10 + ADJCALLSTACKDOWN 0, 0, implicit-def dead $x2, implicit $x2 + $x10 = ADDI $x0, 1 + PseudoCALL target-flags(riscv-call) @bar, csr_ilp32_lp64, implicit-def dead $x1, implicit $x10, implicit-def $x2 + ADJCALLSTACKUP 0, 0, implicit-def dead $x2, implicit $x2 + %2:gpr = ADDI $x0, 2 + BLTU %0, killed %2, %bb.2 + PseudoBR %bb.1 + + bb.1 (%ir-block.3): + $x10 = ADDI $x0, 3 + PseudoTAIL target-flags(riscv-call) @bar, implicit $x2, implicit $x10 + + bb.2 (%ir-block.4): + PseudoRET + +... 
diff --git a/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll b/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll index 6ad529ea477c1a..b26bd7b889807a 100644 --- a/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll +++ b/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll @@ -3,13 +3,13 @@ ; RUN: | FileCheck -check-prefix=NOCMOV %s ; RUN: llc -mtriple=riscv64 -mattr=+conditional-cmv-fusion,+c -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=CMOV,CMOV-NOZICOND %s -; RUN: llc -mtriple=riscv64 -mattr=+conditional-cmv-fusion,+c,+experimental-zicond -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+conditional-cmv-fusion,+c,+zicond -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=CMOV,CMOV-ZICOND %s ; RUN: llc -mtriple=riscv64 -mattr=+short-forward-branch-opt -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=SHORT_FORWARD,SFB-NOZICOND %s ; RUN: llc -mtriple=riscv64 -mattr=+short-forward-branch-opt,+c -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=SHORT_FORWARD,SFB-NOZICOND %s -; RUN: llc -mtriple=riscv64 -mattr=+short-forward-branch-opt,+experimental-zicond -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+short-forward-branch-opt,+zicond -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=SHORT_FORWARD,SFB-ZICOND %s ; The conditional move optimization in sifive-p450 requires that only a diff --git a/llvm/test/CodeGen/RISCV/condbinops.ll b/llvm/test/CodeGen/RISCV/condbinops.ll index 87ebefcb65c7de..1a661fddacfa05 100644 --- a/llvm/test/CodeGen/RISCV/condbinops.ll +++ b/llvm/test/CodeGen/RISCV/condbinops.ll @@ -3,8 +3,8 @@ ; RUN: llc -mtriple=riscv64 < %s | FileCheck %s -check-prefix=RV64I ; RUN: llc -mtriple=riscv64 -mattr=+xventanacondops < %s | FileCheck %s -check-prefix=RV64XVENTANACONDOPS ; RUN: llc -mtriple=riscv64 -mattr=+xtheadcondmov < %s | FileCheck %s -check-prefix=RV64XTHEADCONDMOV -; RUN: llc -mtriple=riscv32 -mattr=+experimental-zicond < %s | FileCheck %s -check-prefix=RV32ZICOND -; 
RUN: llc -mtriple=riscv64 -mattr=+experimental-zicond < %s | FileCheck %s -check-prefix=RV64ZICOND +; RUN: llc -mtriple=riscv32 -mattr=+zicond < %s | FileCheck %s -check-prefix=RV32ZICOND +; RUN: llc -mtriple=riscv64 -mattr=+zicond < %s | FileCheck %s -check-prefix=RV64ZICOND define i32 @shl32(i32 %x, i32 %y, i1 %c) { ; RV32I-LABEL: shl32: diff --git a/llvm/test/CodeGen/RISCV/condops.ll b/llvm/test/CodeGen/RISCV/condops.ll index 23f219c2487ebc..101cb5aeeb0940 100644 --- a/llvm/test/CodeGen/RISCV/condops.ll +++ b/llvm/test/CodeGen/RISCV/condops.ll @@ -3,8 +3,8 @@ ; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs < %s | FileCheck %s -check-prefix=RV64I ; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs,+xventanacondops < %s | FileCheck %s -check-prefix=RV64XVENTANACONDOPS ; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs,+xtheadcondmov < %s | FileCheck %s -check-prefix=RV64XTHEADCONDMOV -; RUN: llc -mtriple=riscv32 -target-abi=ilp32f -mattr=+f,+zbs,+experimental-zicond < %s | FileCheck %s -check-prefix=RV32ZICOND -; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs,+experimental-zicond < %s | FileCheck %s -check-prefix=RV64ZICOND +; RUN: llc -mtriple=riscv32 -target-abi=ilp32f -mattr=+f,+zbs,+zicond < %s | FileCheck %s -check-prefix=RV32ZICOND +; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs,+zicond < %s | FileCheck %s -check-prefix=RV64ZICOND define i64 @zero1(i64 %rs1, i1 zeroext %rc) { ; RV32I-LABEL: zero1: @@ -3719,3 +3719,54 @@ entry: %cond = select i1 %tobool.not, i64 0, i64 %x ret i64 %cond } + +; Test that we don't crash on types larger than 64 bits. 
+define i64 @single_bit3(i80 %x, i64 %y) { +; RV32I-LABEL: single_bit3: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lw a0, 8(a0) +; RV32I-NEXT: slli a0, a0, 31 +; RV32I-NEXT: srai a3, a0, 31 +; RV32I-NEXT: and a0, a3, a1 +; RV32I-NEXT: and a1, a3, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: single_bit3: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 63 +; RV64I-NEXT: srai a0, a1, 63 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: ret +; +; RV64XVENTANACONDOPS-LABEL: single_bit3: +; RV64XVENTANACONDOPS: # %bb.0: # %entry +; RV64XVENTANACONDOPS-NEXT: andi a1, a1, 1 +; RV64XVENTANACONDOPS-NEXT: vt.maskc a0, a2, a1 +; RV64XVENTANACONDOPS-NEXT: ret +; +; RV64XTHEADCONDMOV-LABEL: single_bit3: +; RV64XTHEADCONDMOV: # %bb.0: # %entry +; RV64XTHEADCONDMOV-NEXT: slli a1, a1, 63 +; RV64XTHEADCONDMOV-NEXT: srai a0, a1, 63 +; RV64XTHEADCONDMOV-NEXT: and a0, a0, a2 +; RV64XTHEADCONDMOV-NEXT: ret +; +; RV32ZICOND-LABEL: single_bit3: +; RV32ZICOND: # %bb.0: # %entry +; RV32ZICOND-NEXT: lw a0, 8(a0) +; RV32ZICOND-NEXT: andi a3, a0, 1 +; RV32ZICOND-NEXT: czero.eqz a0, a1, a3 +; RV32ZICOND-NEXT: czero.eqz a1, a2, a3 +; RV32ZICOND-NEXT: ret +; +; RV64ZICOND-LABEL: single_bit3: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: andi a1, a1, 1 +; RV64ZICOND-NEXT: czero.eqz a0, a2, a1 +; RV64ZICOND-NEXT: ret +entry: + %and = and i80 %x, 18446744073709551616 ; 1 << 64 + %tobool.not = icmp eq i80 %and, 0 + %cond = select i1 %tobool.not, i64 0, i64 %y + ret i64 %cond +} diff --git a/llvm/test/CodeGen/RISCV/pr90652.ll b/llvm/test/CodeGen/RISCV/pr90652.ll new file mode 100644 index 00000000000000..2162395b92ac3c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/pr90652.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=riscv64 | FileCheck %s + +define i1 @test(i64 %x, i1 %cond1, i1 %cond2) { +; CHECK-LABEL: test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi a3, a0, 1 +; CHECK-NEXT: slt a0, a3, a0 
+; CHECK-NEXT: not a1, a1 +; CHECK-NEXT: and a0, a1, a0 +; CHECK-NEXT: or a0, a2, a0 +; CHECK-NEXT: ret +entry: + %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %x, i64 1) + %ov = extractvalue { i64, i1 } %sadd, 1 + %or = or i1 %cond2, %ov + %sel = select i1 %cond1, i1 %cond2, i1 %or + ret i1 %sel +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll index dab530751ef96b..799aebcaa63026 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll @@ -238,26 +238,39 @@ define <64 x half> @interleave_v32f16(<32 x half> %x, <32 x half> %y) { define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) { ; V128-LABEL: interleave_v32f32: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; V128-NEXT: vslidedown.vi v0, v8, 16 -; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; V128-NEXT: vwaddu.vv v24, v0, v8 -; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v24, a0, v8 -; V128-NEXT: lui a1, %hi(.LCPI10_0) -; V128-NEXT: addi a1, a1, %lo(.LCPI10_0) -; V128-NEXT: li a2, 32 -; V128-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; V128-NEXT: vle16.v v12, (a1) -; V128-NEXT: lui a1, 699051 -; V128-NEXT: addi a1, a1, -1366 -; V128-NEXT: vmv.s.x v0, a1 +; V128-NEXT: addi sp, sp, -16 +; V128-NEXT: .cfi_def_cfa_offset 16 +; V128-NEXT: csrr a0, vlenb +; V128-NEXT: slli a0, a0, 2 +; V128-NEXT: sub sp, sp, a0 +; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; V128-NEXT: lui a0, %hi(.LCPI10_0) +; V128-NEXT: addi a0, a0, %lo(.LCPI10_0) +; V128-NEXT: li a1, 32 +; V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; V128-NEXT: vle16.v v4, (a0) +; V128-NEXT: lui a0, %hi(.LCPI10_1) +; V128-NEXT: addi a0, a0, %lo(.LCPI10_1) +; V128-NEXT: vle16.v v24, (a0) +; V128-NEXT: addi a0, sp, 16 +; V128-NEXT: vs4r.v v24, (a0) # 
Unknown-size Folded Spill +; V128-NEXT: lui a0, 699051 +; V128-NEXT: addi a0, a0, -1366 +; V128-NEXT: vmv.s.x v0, a0 +; V128-NEXT: vrgatherei16.vv v24, v8, v4 +; V128-NEXT: addi a0, sp, 16 +; V128-NEXT: vl4r.v v12, (a0) # Unknown-size Folded Reload ; V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t ; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; V128-NEXT: vwaddu.vv v0, v8, v16 +; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v0, a0, v16 ; V128-NEXT: vmv8r.v v8, v0 ; V128-NEXT: vmv8r.v v16, v24 +; V128-NEXT: csrr a0, vlenb +; V128-NEXT: slli a0, a0, 2 +; V128-NEXT: add sp, sp, a0 +; V128-NEXT: addi sp, sp, 16 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v32f32: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll index 9e21cc9e3d624a..e1bd16649eede7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll @@ -188,30 +188,24 @@ define <4 x i32> @interleave_v4i32_offset_2(<4 x i32> %x, <4 x i32> %y) { define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) { ; V128-LABEL: interleave_v4i32_offset_1: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; V128-NEXT: vwaddu.vv v10, v8, v8 -; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v10, a0, v8 ; V128-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; V128-NEXT: vid.v v8 -; V128-NEXT: vsrl.vi v8, v8, 1 +; V128-NEXT: vid.v v10 +; V128-NEXT: vsrl.vi v11, v10, 1 +; V128-NEXT: vrgather.vv v10, v8, v11 ; V128-NEXT: vmv.v.i v0, 10 -; V128-NEXT: vadd.vi v8, v8, 1 +; V128-NEXT: vadd.vi v8, v11, 1 ; V128-NEXT: vrgather.vv v10, v9, v8, v0.t ; V128-NEXT: vmv.v.v v8, v10 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v4i32_offset_1: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; V512-NEXT: vwaddu.vv v10, v8, v8 -; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v10, a0, v8 ; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, mu -; 
V512-NEXT: vid.v v8 -; V512-NEXT: vsrl.vi v8, v8, 1 +; V512-NEXT: vid.v v10 +; V512-NEXT: vsrl.vi v11, v10, 1 +; V512-NEXT: vrgather.vv v10, v8, v11 ; V512-NEXT: vmv.v.i v0, 10 -; V512-NEXT: vadd.vi v8, v8, 1 +; V512-NEXT: vadd.vi v8, v11, 1 ; V512-NEXT: vrgather.vv v10, v9, v8, v0.t ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret @@ -403,26 +397,39 @@ define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) { define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) { ; V128-LABEL: interleave_v32i32: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; V128-NEXT: vslidedown.vi v0, v8, 16 -; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; V128-NEXT: vwaddu.vv v24, v0, v8 -; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v24, a0, v8 -; V128-NEXT: lui a1, %hi(.LCPI17_0) -; V128-NEXT: addi a1, a1, %lo(.LCPI17_0) -; V128-NEXT: li a2, 32 -; V128-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; V128-NEXT: vle16.v v12, (a1) -; V128-NEXT: lui a1, 699051 -; V128-NEXT: addi a1, a1, -1366 -; V128-NEXT: vmv.s.x v0, a1 +; V128-NEXT: addi sp, sp, -16 +; V128-NEXT: .cfi_def_cfa_offset 16 +; V128-NEXT: csrr a0, vlenb +; V128-NEXT: slli a0, a0, 2 +; V128-NEXT: sub sp, sp, a0 +; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; V128-NEXT: lui a0, %hi(.LCPI17_0) +; V128-NEXT: addi a0, a0, %lo(.LCPI17_0) +; V128-NEXT: li a1, 32 +; V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; V128-NEXT: vle16.v v4, (a0) +; V128-NEXT: lui a0, %hi(.LCPI17_1) +; V128-NEXT: addi a0, a0, %lo(.LCPI17_1) +; V128-NEXT: vle16.v v24, (a0) +; V128-NEXT: addi a0, sp, 16 +; V128-NEXT: vs4r.v v24, (a0) # Unknown-size Folded Spill +; V128-NEXT: lui a0, 699051 +; V128-NEXT: addi a0, a0, -1366 +; V128-NEXT: vmv.s.x v0, a0 +; V128-NEXT: vrgatherei16.vv v24, v8, v4 +; V128-NEXT: addi a0, sp, 16 +; V128-NEXT: vl4r.v v12, (a0) # Unknown-size Folded Reload ; V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t ; 
V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; V128-NEXT: vwaddu.vv v0, v8, v16 +; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v0, a0, v16 ; V128-NEXT: vmv8r.v v8, v0 ; V128-NEXT: vmv8r.v v16, v24 +; V128-NEXT: csrr a0, vlenb +; V128-NEXT: slli a0, a0, 2 +; V128-NEXT: add sp, sp, a0 +; V128-NEXT: addi sp, sp, 16 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v32i32: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll index a26a87a1f3c139..a56a81f5f793bc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -612,11 +612,13 @@ define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: concat_4xi8_start_undef_at_start: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: li a0, 224 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v10, v10, -4 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vi v8, v11, -4 +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -626,11 +628,13 @@ define <8 x i8> @merge_start_into_end_non_contiguous(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: merge_start_into_end_non_contiguous: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: li a0, 144 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v10, v10, -4 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vi v8, v11, -4 +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -671,11 +675,13 @@ define <8 x 
i8> @merge_slidedown(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: merge_slidedown: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vadd.vi v12, v11, 1 ; CHECK-NEXT: li a0, 195 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -686,12 +692,14 @@ define <8 x i8> @merge_non_contiguous_slideup_slidedown(<8 x i8> %v, <8 x i8> %w ; CHECK-LABEL: merge_non_contiguous_slideup_slidedown: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vadd.vi v10, v10, -1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vadd.vi v12, v11, 2 +; CHECK-NEXT: vrgather.vv v10, v8, v12 ; CHECK-NEXT: li a0, 234 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vi v8, v11, -1 +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -702,13 +710,16 @@ define <8 x i8> @unmergable(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: unmergable: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vadd.vi v11, v10, 2 ; CHECK-NEXT: lui a0, %hi(.LCPI46_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0) -; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vle8.v v12, (a0) ; CHECK-NEXT: li a0, 234 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vrgather.vv v10, v9, v12, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> 
%w, <8 x i32> ret <8 x i8> %res diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index f889041647b235..eeb8e517d01d2d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -8,51 +8,23 @@ ; FIXME: This should be widened to a vlseg2 of <4 x i32> with VL set to 3 define {<3 x i32>, <3 x i32>} @load_factor2_v3(ptr %ptr) { -; RV32-LABEL: load_factor2_v3: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; RV32-NEXT: vle32.v v10, (a0) -; RV32-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vi v9, v10, 2 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vwaddu.vv v8, v10, v9 -; RV32-NEXT: li a0, -1 -; RV32-NEXT: vwmaccu.vx v8, a0, v9 -; RV32-NEXT: vmv.v.i v0, 4 -; RV32-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; RV32-NEXT: vslidedown.vi v12, v10, 4 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; RV32-NEXT: vrgather.vi v8, v12, 0, v0.t -; RV32-NEXT: vid.v v9 -; RV32-NEXT: vadd.vv v9, v9, v9 -; RV32-NEXT: vadd.vi v11, v9, 1 -; RV32-NEXT: vrgather.vv v9, v10, v11 -; RV32-NEXT: vrgather.vi v9, v12, 1, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: load_factor2_v3: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; RV64-NEXT: vle32.v v10, (a0) -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vid.v v8 -; RV64-NEXT: vadd.vv v8, v8, v8 -; RV64-NEXT: vadd.vi v8, v8, 1 -; RV64-NEXT: vrgather.vv v9, v10, v8 -; RV64-NEXT: vmv.v.i v0, 4 -; RV64-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; RV64-NEXT: vslidedown.vi v12, v10, 4 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; RV64-NEXT: vrgather.vi v9, v12, 1, v0.t -; RV64-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v11, v10, 2 -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vwaddu.vv v8, v10, v11 -; RV64-NEXT: li a0, -1 -; RV64-NEXT: 
vwmaccu.vx v8, a0, v11 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; RV64-NEXT: vrgather.vi v8, v12, 0, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: load_factor2_v3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vadd.vv v9, v8, v8 +; CHECK-NEXT: vrgather.vv v8, v10, v9 +; CHECK-NEXT: vmv.v.i v0, 4 +; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v10, 4 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vrgather.vi v8, v12, 0, v0.t +; CHECK-NEXT: vadd.vi v11, v9, 1 +; CHECK-NEXT: vrgather.vv v9, v10, v11 +; CHECK-NEXT: vrgather.vi v9, v12, 1, v0.t +; CHECK-NEXT: ret %interleaved.vec = load <6 x i32>, ptr %ptr %v0 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> %v1 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> @@ -159,142 +131,163 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 58 +; RV32-NEXT: li a3, 62 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x3a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 58 * vlenb -; RV32-NEXT: addi a3, a1, 256 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x3e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 62 * vlenb +; RV32-NEXT: addi a3, a1, 128 +; RV32-NEXT: addi a4, a1, 256 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle32.v v8, (a3) -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 25 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, a1, 128 -; RV32-NEXT: vsetivli zero, 16, e32, 
m4, ta, ma -; RV32-NEXT: vslideup.vi v16, v8, 4 +; RV32-NEXT: vle32.v v16, (a4) ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 12 +; RV32-NEXT: li a5, 29 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs4r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vid.v v20 -; RV32-NEXT: vadd.vi v4, v20, -10 -; RV32-NEXT: vmv.v.v v2, v20 +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV32-NEXT: vid.v v10 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a5, a4, 4 +; RV32-NEXT: slli a5, a4, 3 ; RV32-NEXT: add a4, a5, a4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs2r.v v20, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vs2r.v v10, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vadd.vi v8, v10, -4 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: li a5, 13 +; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vs2r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RV32-NEXT: vrgatherei16.vv v12, v16, v8 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: li a5, 21 +; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vs4r.v v12, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vadd.vi v8, v10, -10 ; RV32-NEXT: lui a4, 12 -; RV32-NEXT: vmv.s.x v1, a4 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 16 +; RV32-NEXT: vmv.s.x v0, a4 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a5, a4, 5 -; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: slli a4, a4, 3 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vs1r.v v0, (a4) # Unknown-size 
Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v16, v16, 16 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 +; RV32-NEXT: li a5, 45 +; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs1r.v v1, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vrgatherei16.vv v16, v8, v4, v0.t +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 21 +; RV32-NEXT: li a5, 25 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs4r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v12, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, %hi(.LCPI6_0) ; RV32-NEXT: addi a4, a4, %lo(.LCPI6_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV32-NEXT: lui a5, %hi(.LCPI6_1) +; RV32-NEXT: addi a5, a5, %lo(.LCPI6_1) +; RV32-NEXT: lui a6, 1 ; RV32-NEXT: vle16.v v8, (a4) +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v8, (a5) ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: slli a4, a4, 2 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: lui a4, %hi(.LCPI6_1) -; RV32-NEXT: addi a4, a4, %lo(.LCPI6_1) -; RV32-NEXT: lui a5, 1 -; RV32-NEXT: vle16.v v8, (a4) -; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v16, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 49 +; RV32-NEXT: li a4, 37 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v24, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: 
addi a1, a1, 16 ; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, a5, -64 +; RV32-NEXT: addi a1, a6, -64 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v16, v4 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v4 -; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vmv.v.v v12, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: slli a3, a1, 3 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl2r.v v10, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vi v8, v10, -2 +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 29 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload 
-; RV32-NEXT: vslideup.vi v12, v8, 2 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v12, v16, v8 ; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vadd.vi v8, v2, -8 +; RV32-NEXT: vadd.vi v8, v10, -8 +; RV32-NEXT: vmv2r.v v30, v10 ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl1r.v v28, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv1r.v v0, v28 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 45 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: vmv.v.v v20, v12 +; RV32-NEXT: vmv.v.v v24, v12 ; RV32-NEXT: lui a1, %hi(.LCPI6_2) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_2) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu @@ -308,165 +301,166 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 37 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v0, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # 
Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t +; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v20, v8 +; RV32-NEXT: vmv.v.v v24, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_4) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_4) ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 29 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v8 +; RV32-NEXT: vrgatherei16.vv v4, v16, v8 ; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vadd.vi v8, v30, -6 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 4 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vi v8, v8, -6 +; RV32-NEXT: vs2r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32-NEXT: vmv1r.v v0, v28 +; RV32-NEXT: vmv1r.v v2, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; 
RV32-NEXT: li a3, 45 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t +; RV32-NEXT: vrgatherei16.vv v4, v16, v8, v0.t ; RV32-NEXT: lui a1, %hi(.LCPI6_5) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_5) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_6) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_6) -; RV32-NEXT: vle16.v v16, (a1) -; RV32-NEXT: vle16.v v28, (a3) +; RV32-NEXT: vle16.v v20, (a1) +; RV32-NEXT: vle16.v v8, (a3) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a3, a1, 3 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: li a1, 960 -; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs1r.v v20, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv.s.x v1, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 37 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v16 -; RV32-NEXT: vmv1r.v v0, v20 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v8, v20 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a3, a1, 3 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v24, v8 +; 
RV32-NEXT: vmv.v.v v4, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: slli a3, a1, 3 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_7) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_7) -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 29 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v12, v16, v8 -; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vmv1r.v v0, v2 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 4 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 13 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vi v8, v8, -4 -; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 45 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: vmv.v.v v24, v12 +; RV32-NEXT: vmv.v.v v4, v12 ; RV32-NEXT: lui a1, %hi(.LCPI6_8) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_8) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_9) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_9) ; RV32-NEXT: vle16.v v16, (a1) -; RV32-NEXT: vle16.v v28, (a3) +; 
RV32-NEXT: vle16.v v20, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 37 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v16 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v16 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v20, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v24, v8 +; RV32-NEXT: vmv.v.v v4, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 4 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 13 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: lui a1, %hi(.LCPI6_10) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_10) @@ -474,20 +468,25 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: lui a1, 15 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 29 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size 
Folded Reload -; RV32-NEXT: vslideup.vi v20, v16, 6 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl2r.v v10, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v20, v16, v10 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 45 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload @@ -502,13 +501,13 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: li a1, 1008 ; RV32-NEXT: vmv.s.x v28, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 29 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v28, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 37 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -516,7 +515,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v8, v0, v24 ; RV32-NEXT: vmv1r.v v0, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -529,19 +528,19 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 12 +; RV32-NEXT: li a3, 21 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload 
; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 45 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload @@ -554,33 +553,33 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vle16.v v24, (a1) ; RV32-NEXT: vle16.v v8, (a2) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 5 -; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: li a2, 45 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 49 +; RV32-NEXT: li a2, 37 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v0, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 25 +; RV32-NEXT: li a2, 29 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 41 +; RV32-NEXT: li a2, 53 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 5 -; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: li a2, 45 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload @@ -594,35 +593,37 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vse32.v v20, (a1) ; RV32-NEXT: addi a1, a0, 192 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a3, a2, 4 -; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: li a3, 13 +; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: 
vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 2 +; RV32-NEXT: slli a3, a2, 3 +; RV32-NEXT: add a2, a3, a2 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: slli a3, a2, 4 +; RV32-NEXT: add a2, a3, a2 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 21 +; RV32-NEXT: li a2, 25 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 58 +; RV32-NEXT: li a1, 62 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index df41ac10f80d36..88c299a19fb4e8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -14638,5 +14638,496 @@ define <8 x i16> @mgather_shuffle_vrgather(ptr %base) { %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %allones, <8 x i16> poison) ret <8 x i16> %v } + +; v32i64 is not a legal type, so make sure we don't try to combine the mgather +; to a vlse intrinsic until it is legalized and split. 
+define <32 x i64> @mgather_strided_split(ptr %base) { +; RV32V-LABEL: mgather_strided_split: +; RV32V: # %bb.0: +; RV32V-NEXT: li a1, 16 +; RV32V-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32V-NEXT: vlse64.v v8, (a0), a1 +; RV32V-NEXT: addi a0, a0, 256 +; RV32V-NEXT: vlse64.v v16, (a0), a1 +; RV32V-NEXT: ret +; +; RV64V-LABEL: mgather_strided_split: +; RV64V: # %bb.0: +; RV64V-NEXT: li a1, 16 +; RV64V-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64V-NEXT: vlse64.v v8, (a0), a1 +; RV64V-NEXT: addi a0, a0, 256 +; RV64V-NEXT: vlse64.v v16, (a0), a1 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_strided_split: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -512 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 512 +; RV32ZVE32F-NEXT: sw ra, 508(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s0, 504(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 500(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s3, 496(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s4, 492(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s5, 488(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s6, 484(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s7, 480(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s8, 476(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s9, 472(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s10, 468(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s11, 464(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset ra, -4 +; RV32ZVE32F-NEXT: .cfi_offset s0, -8 +; RV32ZVE32F-NEXT: .cfi_offset s2, -12 +; RV32ZVE32F-NEXT: .cfi_offset s3, -16 +; RV32ZVE32F-NEXT: .cfi_offset s4, -20 +; RV32ZVE32F-NEXT: .cfi_offset s5, -24 +; RV32ZVE32F-NEXT: .cfi_offset s6, -28 +; RV32ZVE32F-NEXT: .cfi_offset s7, -32 +; RV32ZVE32F-NEXT: .cfi_offset s8, -36 +; RV32ZVE32F-NEXT: .cfi_offset s9, -40 +; RV32ZVE32F-NEXT: .cfi_offset s10, -44 +; RV32ZVE32F-NEXT: .cfi_offset s11, -48 +; RV32ZVE32F-NEXT: addi s0, sp, 512 +; RV32ZVE32F-NEXT: .cfi_def_cfa s0, 0 +; RV32ZVE32F-NEXT: andi sp, 
sp, -128 +; RV32ZVE32F-NEXT: li a2, 32 +; RV32ZVE32F-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32ZVE32F-NEXT: vid.v v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v8, 4 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 216(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 208(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 252(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 248(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 244(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 236(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 228(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 220(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 240(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 232(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 224(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 212(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 204(sp) # 4-byte Folded Spill 
+; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 200(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 196(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 192(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: addi a1, sp, 256 +; RV32ZVE32F-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32ZVE32F-NEXT: vse32.v v8, (a1) +; RV32ZVE32F-NEXT: lw a1, 288(sp) +; RV32ZVE32F-NEXT: lw a2, 292(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 188(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 184(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 296(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: sw a3, 180(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 4(a2) +; RV32ZVE32F-NEXT: sw a2, 176(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 300(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 172(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 168(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 304(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: sw a3, 164(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 4(a2) +; RV32ZVE32F-NEXT: sw a2, 160(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 308(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 156(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 152(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 312(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: sw a3, 148(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 4(a2) +; RV32ZVE32F-NEXT: sw a2, 144(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 316(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 140(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 136(sp) # 
4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 320(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: sw a3, 132(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 4(a2) +; RV32ZVE32F-NEXT: sw a2, 128(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 324(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 124(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 120(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 328(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: sw a3, 116(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 4(a2) +; RV32ZVE32F-NEXT: sw a2, 112(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 332(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 104(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw ra, 4(a1) +; RV32ZVE32F-NEXT: lw a1, 336(sp) +; RV32ZVE32F-NEXT: lw s10, 0(a2) +; RV32ZVE32F-NEXT: lw s8, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 340(sp) +; RV32ZVE32F-NEXT: lw s6, 0(a1) +; RV32ZVE32F-NEXT: lw s4, 4(a1) +; RV32ZVE32F-NEXT: lw a4, 344(sp) +; RV32ZVE32F-NEXT: lw s2, 0(a2) +; RV32ZVE32F-NEXT: lw t5, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 348(sp) +; RV32ZVE32F-NEXT: lw t3, 0(a4) +; RV32ZVE32F-NEXT: lw t2, 4(a4) +; RV32ZVE32F-NEXT: lw a4, 352(sp) +; RV32ZVE32F-NEXT: lw t0, 0(a2) +; RV32ZVE32F-NEXT: lw a7, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 356(sp) +; RV32ZVE32F-NEXT: lw a6, 0(a4) +; RV32ZVE32F-NEXT: lw a5, 4(a4) +; RV32ZVE32F-NEXT: lw a4, 360(sp) +; RV32ZVE32F-NEXT: lw a1, 0(a2) +; RV32ZVE32F-NEXT: sw a1, 108(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a2) +; RV32ZVE32F-NEXT: sw a1, 100(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 364(sp) +; RV32ZVE32F-NEXT: lw s11, 0(a4) +; RV32ZVE32F-NEXT: lw s9, 4(a4) +; RV32ZVE32F-NEXT: lw a1, 368(sp) +; RV32ZVE32F-NEXT: lw s7, 0(a2) +; RV32ZVE32F-NEXT: lw s5, 4(a2) +; RV32ZVE32F-NEXT: lw a3, 372(sp) +; RV32ZVE32F-NEXT: lw s3, 0(a1) +; RV32ZVE32F-NEXT: lw t6, 4(a1) +; RV32ZVE32F-NEXT: lw a2, 376(sp) +; 
RV32ZVE32F-NEXT: lw t4, 0(a3) +; RV32ZVE32F-NEXT: lw a1, 380(sp) +; RV32ZVE32F-NEXT: lw t1, 4(a3) +; RV32ZVE32F-NEXT: lw a4, 0(a2) +; RV32ZVE32F-NEXT: lw a3, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 0(a1) +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a5, 196(a0) +; RV32ZVE32F-NEXT: sw a6, 192(a0) +; RV32ZVE32F-NEXT: sw a7, 188(a0) +; RV32ZVE32F-NEXT: sw t0, 184(a0) +; RV32ZVE32F-NEXT: sw t2, 180(a0) +; RV32ZVE32F-NEXT: sw t3, 176(a0) +; RV32ZVE32F-NEXT: sw t5, 172(a0) +; RV32ZVE32F-NEXT: sw s2, 168(a0) +; RV32ZVE32F-NEXT: sw s4, 164(a0) +; RV32ZVE32F-NEXT: sw s6, 160(a0) +; RV32ZVE32F-NEXT: sw s8, 156(a0) +; RV32ZVE32F-NEXT: sw s10, 152(a0) +; RV32ZVE32F-NEXT: sw ra, 148(a0) +; RV32ZVE32F-NEXT: lw a5, 104(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 144(a0) +; RV32ZVE32F-NEXT: lw a5, 112(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 140(a0) +; RV32ZVE32F-NEXT: lw a5, 116(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 136(a0) +; RV32ZVE32F-NEXT: lw a5, 120(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 132(a0) +; RV32ZVE32F-NEXT: lw a5, 124(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 128(a0) +; RV32ZVE32F-NEXT: lw a5, 128(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 124(a0) +; RV32ZVE32F-NEXT: lw a5, 132(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 120(a0) +; RV32ZVE32F-NEXT: lw a5, 136(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 116(a0) +; RV32ZVE32F-NEXT: lw a5, 140(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 112(a0) +; RV32ZVE32F-NEXT: lw a5, 144(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 108(a0) +; RV32ZVE32F-NEXT: lw a5, 148(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 104(a0) +; RV32ZVE32F-NEXT: lw a5, 152(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 100(a0) +; RV32ZVE32F-NEXT: lw a5, 156(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 96(a0) +; RV32ZVE32F-NEXT: lw a5, 160(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 92(a0) +; 
RV32ZVE32F-NEXT: lw a5, 164(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 88(a0) +; RV32ZVE32F-NEXT: lw a5, 168(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 84(a0) +; RV32ZVE32F-NEXT: lw a5, 172(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 80(a0) +; RV32ZVE32F-NEXT: lw a5, 176(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 76(a0) +; RV32ZVE32F-NEXT: lw a5, 180(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 72(a0) +; RV32ZVE32F-NEXT: lw a5, 184(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 68(a0) +; RV32ZVE32F-NEXT: lw a5, 188(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 64(a0) +; RV32ZVE32F-NEXT: lw a5, 208(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 4(a0) +; RV32ZVE32F-NEXT: lw a5, 216(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 0(a0) +; RV32ZVE32F-NEXT: sw a1, 252(a0) +; RV32ZVE32F-NEXT: sw a2, 248(a0) +; RV32ZVE32F-NEXT: sw a3, 244(a0) +; RV32ZVE32F-NEXT: sw a4, 240(a0) +; RV32ZVE32F-NEXT: sw t1, 236(a0) +; RV32ZVE32F-NEXT: sw t4, 232(a0) +; RV32ZVE32F-NEXT: sw t6, 228(a0) +; RV32ZVE32F-NEXT: sw s3, 224(a0) +; RV32ZVE32F-NEXT: sw s5, 220(a0) +; RV32ZVE32F-NEXT: sw s7, 216(a0) +; RV32ZVE32F-NEXT: sw s9, 212(a0) +; RV32ZVE32F-NEXT: sw s11, 208(a0) +; RV32ZVE32F-NEXT: lw a1, 100(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 204(a0) +; RV32ZVE32F-NEXT: lw a1, 108(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 200(a0) +; RV32ZVE32F-NEXT: lw a1, 220(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 28(a0) +; RV32ZVE32F-NEXT: lw a1, 228(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 24(a0) +; RV32ZVE32F-NEXT: lw a1, 236(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 20(a0) +; RV32ZVE32F-NEXT: lw a1, 244(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 16(a0) +; RV32ZVE32F-NEXT: lw a1, 248(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 12(a0) +; RV32ZVE32F-NEXT: lw a1, 252(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 8(a0) +; 
RV32ZVE32F-NEXT: lw a1, 192(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 60(a0) +; RV32ZVE32F-NEXT: lw a1, 196(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 56(a0) +; RV32ZVE32F-NEXT: lw a1, 200(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 52(a0) +; RV32ZVE32F-NEXT: lw a1, 204(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 48(a0) +; RV32ZVE32F-NEXT: lw a1, 212(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 44(a0) +; RV32ZVE32F-NEXT: lw a1, 224(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 40(a0) +; RV32ZVE32F-NEXT: lw a1, 232(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 36(a0) +; RV32ZVE32F-NEXT: lw a1, 240(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 32(a0) +; RV32ZVE32F-NEXT: addi sp, s0, -512 +; RV32ZVE32F-NEXT: lw ra, 508(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s0, 504(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 500(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s3, 496(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s4, 492(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s5, 488(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s6, 484(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s7, 480(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s8, 476(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s9, 472(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s10, 468(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s11, 464(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 512 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_strided_split: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -144 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 144 +; RV64ZVE32F-NEXT: sd ra, 136(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s0, 128(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s1, 120(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s2, 112(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s3, 104(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s4, 
96(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s5, 88(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s6, 80(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s7, 72(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s8, 64(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s9, 56(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s10, 48(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s11, 40(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: .cfi_offset ra, -8 +; RV64ZVE32F-NEXT: .cfi_offset s0, -16 +; RV64ZVE32F-NEXT: .cfi_offset s1, -24 +; RV64ZVE32F-NEXT: .cfi_offset s2, -32 +; RV64ZVE32F-NEXT: .cfi_offset s3, -40 +; RV64ZVE32F-NEXT: .cfi_offset s4, -48 +; RV64ZVE32F-NEXT: .cfi_offset s5, -56 +; RV64ZVE32F-NEXT: .cfi_offset s6, -64 +; RV64ZVE32F-NEXT: .cfi_offset s7, -72 +; RV64ZVE32F-NEXT: .cfi_offset s8, -80 +; RV64ZVE32F-NEXT: .cfi_offset s9, -88 +; RV64ZVE32F-NEXT: .cfi_offset s10, -96 +; RV64ZVE32F-NEXT: .cfi_offset s11, -104 +; RV64ZVE32F-NEXT: ld a2, 0(a1) +; RV64ZVE32F-NEXT: sd a2, 32(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: ld a2, 16(a1) +; RV64ZVE32F-NEXT: sd a2, 24(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: ld a2, 32(a1) +; RV64ZVE32F-NEXT: sd a2, 16(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: ld a2, 48(a1) +; RV64ZVE32F-NEXT: sd a2, 8(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: ld a2, 64(a1) +; RV64ZVE32F-NEXT: sd a2, 0(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: ld a7, 80(a1) +; RV64ZVE32F-NEXT: ld t0, 96(a1) +; RV64ZVE32F-NEXT: ld t1, 112(a1) +; RV64ZVE32F-NEXT: ld t2, 128(a1) +; RV64ZVE32F-NEXT: ld t3, 144(a1) +; RV64ZVE32F-NEXT: ld t4, 160(a1) +; RV64ZVE32F-NEXT: ld t5, 176(a1) +; RV64ZVE32F-NEXT: ld t6, 192(a1) +; RV64ZVE32F-NEXT: ld s0, 208(a1) +; RV64ZVE32F-NEXT: ld s1, 224(a1) +; RV64ZVE32F-NEXT: ld s2, 240(a1) +; RV64ZVE32F-NEXT: ld s3, 256(a1) +; RV64ZVE32F-NEXT: ld s4, 272(a1) +; RV64ZVE32F-NEXT: ld s5, 288(a1) +; RV64ZVE32F-NEXT: ld s6, 304(a1) +; RV64ZVE32F-NEXT: ld s7, 320(a1) +; RV64ZVE32F-NEXT: ld s8, 336(a1) +; 
RV64ZVE32F-NEXT: ld s9, 352(a1) +; RV64ZVE32F-NEXT: ld s10, 368(a1) +; RV64ZVE32F-NEXT: ld s11, 384(a1) +; RV64ZVE32F-NEXT: ld ra, 400(a1) +; RV64ZVE32F-NEXT: ld a6, 416(a1) +; RV64ZVE32F-NEXT: ld a5, 432(a1) +; RV64ZVE32F-NEXT: ld a2, 496(a1) +; RV64ZVE32F-NEXT: ld a3, 480(a1) +; RV64ZVE32F-NEXT: ld a4, 464(a1) +; RV64ZVE32F-NEXT: ld a1, 448(a1) +; RV64ZVE32F-NEXT: sd a2, 248(a0) +; RV64ZVE32F-NEXT: sd a3, 240(a0) +; RV64ZVE32F-NEXT: sd a4, 232(a0) +; RV64ZVE32F-NEXT: sd a1, 224(a0) +; RV64ZVE32F-NEXT: sd a5, 216(a0) +; RV64ZVE32F-NEXT: sd a6, 208(a0) +; RV64ZVE32F-NEXT: sd ra, 200(a0) +; RV64ZVE32F-NEXT: sd s11, 192(a0) +; RV64ZVE32F-NEXT: sd s10, 184(a0) +; RV64ZVE32F-NEXT: sd s9, 176(a0) +; RV64ZVE32F-NEXT: sd s8, 168(a0) +; RV64ZVE32F-NEXT: sd s7, 160(a0) +; RV64ZVE32F-NEXT: sd s6, 152(a0) +; RV64ZVE32F-NEXT: sd s5, 144(a0) +; RV64ZVE32F-NEXT: sd s4, 136(a0) +; RV64ZVE32F-NEXT: sd s3, 128(a0) +; RV64ZVE32F-NEXT: sd s2, 120(a0) +; RV64ZVE32F-NEXT: sd s1, 112(a0) +; RV64ZVE32F-NEXT: sd s0, 104(a0) +; RV64ZVE32F-NEXT: sd t6, 96(a0) +; RV64ZVE32F-NEXT: sd t5, 88(a0) +; RV64ZVE32F-NEXT: sd t4, 80(a0) +; RV64ZVE32F-NEXT: sd t3, 72(a0) +; RV64ZVE32F-NEXT: sd t2, 64(a0) +; RV64ZVE32F-NEXT: sd t1, 56(a0) +; RV64ZVE32F-NEXT: sd t0, 48(a0) +; RV64ZVE32F-NEXT: sd a7, 40(a0) +; RV64ZVE32F-NEXT: ld a1, 0(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: sd a1, 32(a0) +; RV64ZVE32F-NEXT: ld a1, 8(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: sd a1, 24(a0) +; RV64ZVE32F-NEXT: ld a1, 16(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: sd a1, 16(a0) +; RV64ZVE32F-NEXT: ld a1, 24(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: sd a1, 8(a0) +; RV64ZVE32F-NEXT: ld a1, 32(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: sd a1, 0(a0) +; RV64ZVE32F-NEXT: ld ra, 136(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s0, 128(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s1, 120(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s2, 112(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s3, 
104(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s4, 96(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s5, 88(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s6, 80(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s7, 72(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s8, 64(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s9, 56(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s10, 48(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s11, 40(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: addi sp, sp, 144 +; RV64ZVE32F-NEXT: ret + %ptrs = getelementptr inbounds i64, ptr %base, <32 x i64> + %x = call <32 x i64> @llvm.masked.gather.v32i64.v32p0(<32 x ptr> %ptrs, i32 8, <32 x i1> shufflevector(<32 x i1> insertelement(<32 x i1> poison, i1 true, i32 0), <32 x i1> poison, <32 x i32> zeroinitializer), <32 x i64> poison) + ret <32 x i64> %x +} + +define <4 x i32> @masked_gather_widen_sew_negative_stride(ptr %base) { +; RV32V-LABEL: masked_gather_widen_sew_negative_stride: +; RV32V: # %bb.0: +; RV32V-NEXT: addi a0, a0, 136 +; RV32V-NEXT: li a1, -136 +; RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32V-NEXT: vlse64.v v8, (a0), a1 +; RV32V-NEXT: ret +; +; RV64V-LABEL: masked_gather_widen_sew_negative_stride: +; RV64V: # %bb.0: +; RV64V-NEXT: addi a0, a0, 136 +; RV64V-NEXT: li a1, -136 +; RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64V-NEXT: vlse64.v v8, (a0), a1 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: masked_gather_widen_sew_negative_stride: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: lui a1, 16393 +; RV32ZVE32F-NEXT: addi a1, a1, -888 +; RV32ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32ZVE32F-NEXT: vmv.s.x v9, a1 +; RV32ZVE32F-NEXT: vluxei8.v v8, (a0), v9 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: masked_gather_widen_sew_negative_stride: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi a1, a0, 136 +; RV64ZVE32F-NEXT: lw a2, 140(a0) +; RV64ZVE32F-NEXT: lw a3, 0(a0) +; RV64ZVE32F-NEXT: lw a0, 4(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 
4, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vlse32.v v8, (a1), zero +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32F-NEXT: ret + %ptrs = getelementptr i32, ptr %base, <4 x i64> + %x = call <4 x i32> @llvm.masked.gather.v4i32.v32p0(<4 x ptr> %ptrs, i32 8, <4 x i1> shufflevector(<4 x i1> insertelement(<4 x i1> poison, i1 true, i32 0), <4 x i1> poison, <4 x i32> zeroinitializer), <4 x i32> poison) + ret <4 x i32> %x +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll index f53b51e05c5726..3f0bdb9d5e3166 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll @@ -138,8 +138,8 @@ define <4 x i64> @m2_splat_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range ret <4 x i64> %res } -define <4 x i64> @m2_splat_into_identity_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) { -; CHECK-LABEL: m2_splat_into_identity_two_source: +define <4 x i64> @m2_splat_into_identity_two_source_v2_hi(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) { +; CHECK-LABEL: m2_splat_into_identity_two_source_v2_hi: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vrgather.vi v10, v8, 0 @@ -149,6 +149,18 @@ define <4 x i64> @m2_splat_into_identity_two_source(<4 x i64> %v1, <4 x i64> %v2 ret <4 x i64> %res } +define <4 x i64> @m2_splat_into_slide_two_source_v2_lo(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) { +; CHECK-LABEL: m2_splat_into_slide_two_source_v2_lo: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vrgather.vi v12, v8, 0 +; CHECK-NEXT: vmv1r.v v13, v10 +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: ret + %res = 
shufflevector <4 x i64> %v1, <4 x i64> %v2, <4 x i32> + ret <4 x i64> %res +} + define <4 x i64> @m2_splat_into_slide_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) { ; CHECK-LABEL: m2_splat_into_slide_two_source: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll index a34fa9502d93b3..d0777962a75651 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll @@ -8,11 +8,13 @@ define <8 x i8> @trn1.v8i8(<8 x i8> %v0, <8 x i8> %v1) { ; CHECK-LABEL: trn1.v8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vi v8, v11, -1 +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ret <8 x i8> %tmp0 @@ -22,11 +24,13 @@ define <8 x i8> @trn2.v8i8(<8 x i8> %v0, <8 x i8> %v1) { ; CHECK-LABEL: trn2.v8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vadd.vi v12, v11, 1 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ret <8 x i8> %tmp0 @@ -36,14 +40,16 @@ define <16 x i8> @trn1.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-LABEL: trn1.v16i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vadd.vi v10, 
v10, -1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vadd.vi v8, v11, -1 ; CHECK-NEXT: lui a0, 11 ; CHECK-NEXT: addi a0, a0, -1366 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ret <16 x i8> %tmp0 @@ -53,14 +59,16 @@ define <16 x i8> @trn2.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-LABEL: trn2.v16i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vadd.vi v12, v11, 1 +; CHECK-NEXT: vrgather.vv v10, v8, v12 ; CHECK-NEXT: lui a0, 11 ; CHECK-NEXT: addi a0, a0, -1366 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ret <16 x i8> %tmp0 @@ -70,10 +78,12 @@ define <4 x i16> @trn1.v4i16(<4 x i16> %v0, <4 x i16> %v1) { ; CHECK-LABEL: trn1.v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vi v8, v11, -1 +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ret <4 x i16> %tmp0 @@ -83,10 +93,12 @@ define <4 x i16> @trn2.v4i16(<4 x i16> %v0, <4 x i16> %v1) { ; CHECK-LABEL: trn2.v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli 
zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vadd.vi v12, v11, 1 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ret <4 x i16> %tmp0 @@ -96,11 +108,13 @@ define <8 x i16> @trn1.v8i16(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: trn1.v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vi v8, v11, -1 +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ret <8 x i16> %tmp0 @@ -110,11 +124,13 @@ define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: trn2.v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vadd.vi v12, v11, 1 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ret <8 x i16> %tmp0 @@ -147,10 +163,12 @@ define <4 x i32> @trn1.v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: trn1.v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v10, v10, -1 
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vi v8, v11, -1 +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ret <4 x i32> %tmp0 @@ -160,10 +178,12 @@ define <4 x i32> @trn2.v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: trn2.v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vadd.vi v12, v11, 1 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ret <4 x i32> %tmp0 @@ -219,10 +239,12 @@ define <4 x float> @trn1.v4f32(<4 x float> %v0, <4 x float> %v1) { ; CHECK-LABEL: trn1.v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vi v8, v11, -1 +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ret <4 x float> %tmp0 @@ -232,10 +254,12 @@ define <4 x float> @trn2.v4f32(<4 x float> %v0, <4 x float> %v1) { ; CHECK-LABEL: trn2.v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vadd.vi v12, v11, 1 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x float> %v0, 
<4 x float> %v1, <4 x i32> ret <4 x float> %tmp0 @@ -268,10 +292,12 @@ define <4 x half> @trn1.v4f16(<4 x half> %v0, <4 x half> %v1) { ; CHECK-LABEL: trn1.v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vi v8, v11, -1 +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ret <4 x half> %tmp0 @@ -281,10 +307,12 @@ define <4 x half> @trn2.v4f16(<4 x half> %v0, <4 x half> %v1) { ; CHECK-LABEL: trn2.v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vadd.vi v12, v11, 1 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ret <4 x half> %tmp0 @@ -294,11 +322,13 @@ define <8 x half> @trn1.v8f16(<8 x half> %v0, <8 x half> %v1) { ; CHECK-LABEL: trn1.v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vi v8, v11, -1 +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ret <8 x half> %tmp0 @@ -308,11 +338,13 @@ define <8 x half> @trn2.v8f16(<8 x half> %v0, <8 x half> %v1) { ; CHECK-LABEL: trn2.v8f16: ; CHECK: # %bb.0: ; 
CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vadd.vi v12, v11, 1 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ret <8 x half> %tmp0 diff --git a/llvm/test/CodeGen/RISCV/select-binop-identity.ll b/llvm/test/CodeGen/RISCV/select-binop-identity.ll index f45d67164d640d..83bb7f19fa2b05 100644 --- a/llvm/test/CodeGen/RISCV/select-binop-identity.ll +++ b/llvm/test/CodeGen/RISCV/select-binop-identity.ll @@ -7,9 +7,9 @@ ; RUN: | FileCheck -check-prefix=SFB64 %s ; RUN: llc -mtriple=riscv64 -mattr=+xventanacondops -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=VTCONDOPS64 %s -; RUN: llc -mtriple=riscv32 -mattr=+experimental-zicond -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+zicond -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV32,ZICOND,ZICOND32 %s -; RUN: llc -mtriple=riscv64 -mattr=+experimental-zicond -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+zicond -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=ZICOND,ZICOND64 %s ; InstCombine canonicalizes (c ? x | y : x) to (x | (c ? 
y : 0)) similar for diff --git a/llvm/test/CodeGen/RISCV/select.ll b/llvm/test/CodeGen/RISCV/select.ll index 7fa27a307757d0..7dd223df5e557e 100644 --- a/llvm/test/CodeGen/RISCV/select.ll +++ b/llvm/test/CodeGen/RISCV/select.ll @@ -2,8 +2,8 @@ ; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32IM %s ; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64IM %s ; RUN: llc -mtriple=riscv64 -mattr=+m,+xventanacondops -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64IMXVTCONDOPS %s -; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-zicond -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECKZICOND,RV32IMZICOND %s -; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-zicond -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECKZICOND,RV64IMZICOND %s +; RUN: llc -mtriple=riscv32 -mattr=+m,+zicond -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECKZICOND,RV32IMZICOND %s +; RUN: llc -mtriple=riscv64 -mattr=+m,+zicond -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECKZICOND,RV64IMZICOND %s define i16 @select_xor_1(i16 %A, i8 %cond) { ; RV32IM-LABEL: select_xor_1: diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll index d2ddbe99000ed0..87406f22d169d1 100644 --- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll +++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll @@ -3,7 +3,7 @@ ; RUN: | FileCheck -check-prefix=NOSFB %s ; RUN: llc -mtriple=riscv64 -mcpu=sifive-u74 -mattr=+zbb -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=SFB,NOZICOND,RV64SFB %s -; RUN: llc -mtriple=riscv64 -mcpu=sifive-u74 -mattr=+experimental-zicond,+zbb \ +; RUN: llc -mtriple=riscv64 -mcpu=sifive-u74 -mattr=+zicond,+zbb \ ; RUN: -verify-machineinstrs < %s | FileCheck -check-prefixes=SFB,ZICOND %s ; RUN: llc -mtriple=riscv32 
-mcpu=sifive-e76 -mattr=+zbb -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=SFB,NOZICOND,RV32SFB %s diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll index f878d17d5f1da6..ac67c0769f7056 100644 --- a/llvm/test/CodeGen/RISCV/xaluo.ll +++ b/llvm/test/CodeGen/RISCV/xaluo.ll @@ -3,8 +3,8 @@ ; RUN: llc < %s -mtriple=riscv64 -mattr=+m -verify-machineinstrs | FileCheck %s --check-prefix=RV64 ; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+zba -verify-machineinstrs | FileCheck %s --check-prefix=RV32ZBA ; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+zba -verify-machineinstrs | FileCheck %s --check-prefix=RV64ZBA -; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+experimental-zicond -verify-machineinstrs | FileCheck %s --check-prefix=RV32ZICOND -; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+experimental-zicond -verify-machineinstrs | FileCheck %s --check-prefix=RV64ZICOND +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+zicond -verify-machineinstrs | FileCheck %s --check-prefix=RV32ZICOND +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+zicond -verify-machineinstrs | FileCheck %s --check-prefix=RV64ZICOND ; ; Get the actual value of the overflow bit. 
diff --git a/llvm/test/CodeGen/SPARC/inlineasm-bad.ll b/llvm/test/CodeGen/SPARC/inlineasm-bad.ll index 5bf2adbeb75c95..07eb67df6e5f7e 100644 --- a/llvm/test/CodeGen/SPARC/inlineasm-bad.ll +++ b/llvm/test/CodeGen/SPARC/inlineasm-bad.ll @@ -11,3 +11,12 @@ entry: tail call void asm sideeffect "faddq $0,$1,$2", "{f38},{f0},{f0}"(fp128 0xL0, fp128 0xL0, fp128 0xL0) ret void } + +; CHECK-label:test_twinword_error +; CHECK: error: Hi part of pair should point to an even-numbered register +; CHECK: error: (note that in some cases it might be necessary to manually bind the input/output registers instead of relying on automatic allocation) + +define i64 @test_twinword_error(){ + %1 = tail call i64 asm sideeffect "rd %asr5, ${0:L} \0A\09 srlx ${0:L}, 32, ${0:H}", "={i1}"() + ret i64 %1 +} diff --git a/llvm/test/CodeGen/SPARC/inlineasm.ll b/llvm/test/CodeGen/SPARC/inlineasm.ll index 8bf34bf1609c18..efb7f7c15220c2 100644 --- a/llvm/test/CodeGen/SPARC/inlineasm.ll +++ b/llvm/test/CodeGen/SPARC/inlineasm.ll @@ -143,3 +143,12 @@ entry: %1 = call double asm sideeffect "faddd $1, $2, $0", "=f,f,e"(i64 0, i64 0) ret void } + +; CHECK-label:test_twinword +; CHECK: rd %asr5, %i1 +; CHECK: srlx %i1, 32, %i0 + +define i64 @test_twinword(){ + %1 = tail call i64 asm sideeffect "rd %asr5, ${0:L} \0A\09 srlx ${0:L}, 32, ${0:H}", "={i0}"() + ret i64 %1 +} diff --git a/llvm/test/CodeGen/SPARC/reserved-arg-regs.ll b/llvm/test/CodeGen/SPARC/reserved-arg-regs.ll new file mode 100644 index 00000000000000..3587ecb7f3c94a --- /dev/null +++ b/llvm/test/CodeGen/SPARC/reserved-arg-regs.ll @@ -0,0 +1,25 @@ +;; Test reserving argument registers. 
+; RUN: not llc < %s -mtriple=sparc-linux-gnu -mattr=+reserve-o0 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-O0 +; RUN: not llc < %s -mtriple=sparc64-linux-gnu -mattr=+reserve-o0 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-O0 +; RUN: not llc < %s -mtriple=sparc-linux-gnu -mattr=+reserve-i0 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-I0 +; RUN: not llc < %s -mtriple=sparc64-linux-gnu -mattr=+reserve-i0 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-I0 + +; CHECK-RESERVED-O0: error: +; CHECK-RESERVED-O0-SAME: SPARC doesn't support function calls if any of the argument registers is reserved. +; CHECK-RESERVED-I0: error: +; CHECK-RESERVED-I0-SAME: SPARC doesn't support function calls if any of the argument registers is reserved. +define void @call_function() { + call void @foo() + ret void +} +declare void @foo() + +; CHECK-RESERVED-O0: error: +; CHECK-RESERVED-O0-SAME: SPARC doesn't support function calls if any of the argument registers is reserved. +; CHECK-RESERVED-I0: error: +; CHECK-RESERVED-I0-SAME: SPARC doesn't support function calls if any of the argument registers is reserved. +define void @call_function_with_arg(i8 %in) { + call void @bar(i8 %in) + ret void +} +declare void @bar(i8) diff --git a/llvm/test/CodeGen/SPARC/reserved-regs-named.ll b/llvm/test/CodeGen/SPARC/reserved-regs-named.ll new file mode 100644 index 00000000000000..91808be156c559 --- /dev/null +++ b/llvm/test/CodeGen/SPARC/reserved-regs-named.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-l0 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-L0 + +;; Ensure explicit register references are catched as well. 
+ +; CHECK-RESERVED-L0: %l0 +define void @set_reg(i32 zeroext %x) { +entry: + tail call void @llvm.write_register.i32(metadata !0, i32 %x) + ret void +} + +declare void @llvm.write_register.i32(metadata, i32) +!0 = !{!"l0"} diff --git a/llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll b/llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll new file mode 100644 index 00000000000000..53ca045f100443 --- /dev/null +++ b/llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll @@ -0,0 +1,14 @@ +; RUN: not --crash llc -mtriple=sparc64-linux-gnu -o - %s 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-L0 + +;; Ensure explicit register references for non-reserved registers +;; are caught properly. + +; CHECK-RESERVED-L0: LLVM ERROR: Invalid register name global variable +define void @set_reg(i32 zeroext %x) { +entry: + tail call void @llvm.write_register.i32(metadata !0, i32 %x) + ret void +} + +declare void @llvm.write_register.i32(metadata, i32) +!0 = !{!"l0"} diff --git a/llvm/test/CodeGen/SPARC/reserved-regs.ll b/llvm/test/CodeGen/SPARC/reserved-regs.ll index ec6290586eeef2..7dea1f31538b84 100644 --- a/llvm/test/CodeGen/SPARC/reserved-regs.ll +++ b/llvm/test/CodeGen/SPARC/reserved-regs.ll @@ -1,5 +1,14 @@ ; RUN: llc -march=sparc -verify-machineinstrs < %s | FileCheck %s +;; Test reserve-* options. +; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-g1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-G1 +; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-o1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-O1 +; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-l1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-L1 +; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-i1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-I1 + +;; Test multiple reserve-* options together. 
+; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-g1 -mattr=+reserve-o1 -mattr=+reserve-l1 -mattr=+reserve-i1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-G1,CHECK-RESERVED-O1,CHECK-RESERVED-L1,CHECK-RESERVED-I1 + @g = common global [32 x i32] zeroinitializer, align 16 @h = common global [16 x i64] zeroinitializer, align 16 @@ -16,6 +25,10 @@ ; CHECK-NOT: %o6 ; CHECK-NOT: %i6 ; CHECK-NOT: %i7 +; CHECK-RESERVED-G1-NOT: %g1 +; CHECK-RESERVED-O1-NOT: %o1 +; CHECK-RESERVED-L1-NOT: %l1 +; CHECK-RESERVED-I1-NOT: %i1 ; CHECK: ret define void @use_all_i32_regs() { entry: @@ -100,6 +113,10 @@ entry: ; CHECK-NOT: %o7 ; CHECK-NOT: %i6 ; CHECK-NOT: %i7 +; CHECK-RESERVED-G1-NOT: %g1 +; CHECK-RESERVED-O1-NOT: %o1 +; CHECK-RESERVED-L1-NOT: %l1 +; CHECK-RESERVED-I1-NOT: %i1 ; CHECK: ret define void @use_all_i64_regs() { entry: diff --git a/llvm/test/CodeGen/SystemZ/branch-folder-hoist-livein.mir b/llvm/test/CodeGen/SystemZ/branch-folder-hoist-livein.mir index 5e100b88ead300..82e3bae97ec0c4 100644 --- a/llvm/test/CodeGen/SystemZ/branch-folder-hoist-livein.mir +++ b/llvm/test/CodeGen/SystemZ/branch-folder-hoist-livein.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -verify-machineinstrs -O1 -mtriple=s390x-ibm-linux -o - %s -run-pass=branch-folder | FileCheck %s --- | target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64" @@ -15,6 +16,30 @@ name: f1 tracksRegLiveness: true body: | + ; CHECK-LABEL: name: f1 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x7fffffff), %bb.1(0x00000001) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $r1d = LGRL @b :: (load (s32) from got, align 8) + ; CHECK-NEXT: renamable $r1l = LH killed renamable $r1d, 0, $noreg, implicit-def $r1d :: (dereferenceable load (s8) from @b) + ; CHECK-NEXT: renamable $r2l = LHI 0 + ; CHECK-NEXT: renamable $r3d = LGRL @d :: (load (s32) from got, align 8) + ; CHECK-NEXT: renamable $r4d = LLILL 0, 
implicit-def $r4q + ; CHECK-NEXT: renamable $r4d = COPY killed renamable $r4d, implicit killed $r4q + ; CHECK-NEXT: CHI killed renamable $r2l, 0, implicit-def $cc + ; CHECK-NEXT: BRC 14, 6, %bb.2, implicit killed $cc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: + ; CHECK-NEXT: liveins: $r3d, $r4d, $r1l + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: STH renamable $r1l, killed renamable $r3d, 0, $noreg, implicit killed $r4d :: (store (s8) into @d) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $r3d, $r4d, $r1l + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: STH renamable $r1l, killed renamable $r3d, 0, $noreg, implicit killed $r4d :: (store (s8) into @d) + ; CHECK-NEXT: Return bb.0: successors: %bb.2(0x7fffffff), %bb.1(0x00000001) liveins: @@ -44,14 +69,3 @@ body: | Return ... - -# CHECK: renamable $r4d = COPY killed renamable $r4d, implicit killed $r4q -# CHECK-NEXT: CHI killed renamable $r2l, 0, implicit-def $cc -# CHECK-NEXT: BRC 14, 6, %bb.2, implicit killed $cc -# CHECK-NEXT: {{^ $}} -# CHECK-NEXT: bb.1: -# CHECK-NEXT: successors: -# CHECK-NEXT: liveins: $r1l, $r3d, $r4d - -# CHECK: bb.2: -# CHECK-NEXT: liveins: $r1l, $r3d, $r4d diff --git a/llvm/test/CodeGen/SystemZ/frame-29.ll b/llvm/test/CodeGen/SystemZ/frame-29.ll new file mode 100644 index 00000000000000..6cc0d9e985e160 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/frame-29.ll @@ -0,0 +1,18 @@ +; RUN: llc %s -o - -mtriple=s390x-linux-gnu -mcpu=z16 -print-after=finalize-isel 2>&1 | FileCheck %s +; +; Test that the correct space is allocated for the outgoing stack argument. 
+ +declare void @bar(i72 %Arg); + +define void @foo() { +; CHECK-LABEL: # Machine code for function foo: IsSSA, TracksLiveness +; CHECK-NEXT: Frame Objects: +; CHECK-NEXT: fi#0: size=1, align=2, at location [SP] +; CHECK-NEXT: fi#1: size=16, align=8, at location [SP] + +; CHECK-LABEL: foo: +; CHECK: aghi %r15, -184 + %1 = alloca i8, align 2 + tail call fastcc void @bar(i72 2097168) + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/int-usub-12.ll b/llvm/test/CodeGen/SystemZ/int-usub-12.ll index c39a6da37048d3..147fbfd920a9dc 100644 --- a/llvm/test/CodeGen/SystemZ/int-usub-12.ll +++ b/llvm/test/CodeGen/SystemZ/int-usub-12.ll @@ -11,6 +11,7 @@ define zeroext i1 @f1(i128 %a, i128 %b, ptr %res) { ; CHECK-NEXT: vscbiq %v2, %v1, %v0 ; CHECK-NEXT: vlgvg %r2, %v2, 1 ; CHECK-NEXT: vsq %v0, %v1, %v0 +; CHECK-NEXT: xilf %r2, 1 ; CHECK-NEXT: vst %v0, 0(%r4), 3 ; CHECK-NEXT: br %r14 %t = call {i128, i1} @llvm.usub.with.overflow.i128(i128 %a, i128 %b) @@ -27,6 +28,7 @@ define zeroext i1 @f2(i128 %a, i128 %b) { ; CHECK-NEXT: vl %v1, 0(%r2), 3 ; CHECK-NEXT: vscbiq %v0, %v1, %v0 ; CHECK-NEXT: vlgvg %r2, %v0, 1 +; CHECK-NEXT: xilf %r2, 1 ; CHECK-NEXT: br %r14 %t = call {i128, i1} @llvm.usub.with.overflow.i128(i128 %a, i128 %b) %obit = extractvalue {i128, i1} %t, 1 @@ -46,5 +48,25 @@ define i128 @f3(i128 %a, i128 %b) { ret i128 %val } +define i128 @f4(i128 %a, i128 %b) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vscbiq %v2, %v1, %v0 +; CHECK-NEXT: vlgvf %r0, %v2, 3 +; CHECK-NEXT: vgbm %v2, 0 +; CHECK-NEXT: xilf %r0, 1 +; CHECK-NEXT: jl .LBB3_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: vsq %v2, %v1, %v0 +; CHECK-NEXT: .LBB3_2: +; CHECK-NEXT: vst %v2, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %val = call i128 @llvm.usub.sat.i128(i128 %a, i128 %b) + ret i128 %val +} + declare {i128, i1} @llvm.usub.with.overflow.i128(i128, i128) nounwind readnone +declare i128 @llvm.usub.sat.i128(i128, i128) nounwind readnone diff 
--git a/llvm/test/CodeGen/SystemZ/int-usub-13.ll b/llvm/test/CodeGen/SystemZ/int-usub-13.ll index 637e1a81de996f..794af3b73fbe2a 100644 --- a/llvm/test/CodeGen/SystemZ/int-usub-13.ll +++ b/llvm/test/CodeGen/SystemZ/int-usub-13.ll @@ -15,6 +15,7 @@ define zeroext i1 @f1(i256 %a, i256 %b, ptr %res) { ; CHECK-NEXT: vlgvg %r2, %v5, 1 ; CHECK-NEXT: vsbiq %v0, %v1, %v0, %v4 ; CHECK-NEXT: vsq %v1, %v3, %v2 +; CHECK-NEXT: xilf %r2, 1 ; CHECK-NEXT: vst %v1, 16(%r4), 3 ; CHECK-NEXT: vst %v0, 0(%r4), 3 ; CHECK-NEXT: br %r14 @@ -35,6 +36,7 @@ define zeroext i1 @f2(i256 %a, i256 %b) { ; CHECK-NEXT: vscbiq %v2, %v3, %v2 ; CHECK-NEXT: vsbcbiq %v0, %v1, %v0, %v2 ; CHECK-NEXT: vlgvg %r2, %v0, 1 +; CHECK-NEXT: xilf %r2, 1 ; CHECK-NEXT: br %r14 %t = call {i256, i1} @llvm.usub.with.overflow.i256(i256 %a, i256 %b) %obit = extractvalue {i256, i1} %t, 1 diff --git a/llvm/test/CodeGen/SystemZ/loop-01.ll b/llvm/test/CodeGen/SystemZ/loop-01.ll index 15dfae73c97bc9..554c248f8dbf3b 100644 --- a/llvm/test/CodeGen/SystemZ/loop-01.ll +++ b/llvm/test/CodeGen/SystemZ/loop-01.ll @@ -312,3 +312,26 @@ for.inc.i: ; preds = %for.body.i63 %indvars.iv.next156.i.3 = add nsw i64 %indvars.iv155.i, 4 br label %for.body.i63 } + +; Test that offsets are in range for i128 memory accesses. 
+define void @fun10() { +; CHECK-Z13-LABEL: fun10: +; CHECK-Z13: # =>This Inner Loop Header: Depth=1 +; CHECK-Z13-NOT: lay +entry: + %A1 = alloca [3 x [7 x [10 x i128]]], align 8 + br label %for.body + +for.body: ; preds = %for.body, %entry + %IV = phi i64 [ 0, %entry ], [ %IV.next, %for.body ] + %Addr1 = getelementptr inbounds [3 x [7 x [10 x i128]]], ptr %A1, i64 0, i64 %IV, i64 6, i64 6 + store i128 17174966165894859678, ptr %Addr1, align 8 + %Addr2 = getelementptr inbounds [3 x [7 x [10 x i128]]], ptr %A1, i64 0, i64 %IV, i64 6, i64 8 + store i128 17174966165894859678, ptr %Addr2, align 8 + %IV.next = add nuw nsw i64 %IV, 1 + %exitcond.not.i.i = icmp eq i64 %IV.next, 3 + br i1 %exitcond.not.i.i, label %exit, label %for.body + +exit: ; preds = %for.body + unreachable +} diff --git a/llvm/test/CodeGen/SystemZ/zos-ppa2.ll b/llvm/test/CodeGen/SystemZ/zos-ppa2.ll index f54f654b804a23..60580aeb6d83cc 100644 --- a/llvm/test/CodeGen/SystemZ/zos-ppa2.ll +++ b/llvm/test/CodeGen/SystemZ/zos-ppa2.ll @@ -24,7 +24,7 @@ ; CHECK: .byte 0 ; CHECK: .byte 3 ; CHECK: .short 30 -; CHECK: .ascii "\323\323\345\324@@@@@@\361\370\360\360\361\371\367\360\360\361\360\361\360\360\360\360\360\360\360\360" +; CHECK: .ascii "\323\323\345\324@@@@@@{{((\\3[0-7]{2}){4})}}\361\371\367\360\360\361\360\361\360\360\360\360\360\360\360\360" define void @void_test() { entry: ret void diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.mir index 1bd1d6b99e4225..15aa62d5cff6b5 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.mir @@ -336,7 +336,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x04000000), %bb.2(0x7c000000) - ; CHECK-NEXT: liveins: $q0, $r0, $r1, $r2, $r3, $r6, $r12 + ; CHECK-NEXT: liveins: $d0, $d1, $r0, $r1, $r2, $r3, $r6, $r12 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $r3, dead $cpsr = 
nuw nsw tADDi8 killed renamable $r3, 1, 14 /* CC::al */, $noreg ; CHECK-NEXT: renamable $r0 = tADDhirr killed renamable $r0, renamable $r1, 14 /* CC::al */, $noreg diff --git a/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll b/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll new file mode 100644 index 00000000000000..0ce346f7c2e02c --- /dev/null +++ b/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll @@ -0,0 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=x86_64 -- | FileCheck %s + +define i32 @h(i1 %arg, i32 %arg1) { +; CHECK-LABEL: h: +; CHECK: # %bb.0: # %bb +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: movabsq $9166129423, %rcx # imm = 0x22258090F +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: jmp .LBB0_1 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB0_9: # %bb18 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: jne .LBB0_10 +; CHECK-NEXT: .LBB0_1: # %bb4 +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: jne .LBB0_2 +; CHECK-NEXT: # %bb.7: # %bb16 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: jne .LBB0_9 +; CHECK-NEXT: # %bb.8: # %bb17 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: jmp .LBB0_9 +; CHECK-NEXT: .LBB0_2: # %bb9 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: je .LBB0_4 +; CHECK-NEXT: # %bb.3: # %bb13 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: .LBB0_4: # %bb14 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: cmpl $1, %esi +; CHECK-NEXT: je .LBB0_1 +; CHECK-NEXT: # %bb.5: # %bb14 +; CHECK-NEXT: movl %eax, %r8d +; CHECK-NEXT: testl %esi, %esi +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: jne 
.LBB0_6 +; CHECK-NEXT: .LBB0_10: # %bb22 +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_6: # %bb22.loopexit1 +; CHECK-NEXT: movl %r8d, %eax +; CHECK-NEXT: retq +bb: + br label %bb2 + +bb2: ; preds = %bb14, %bb + %i = phi i64 [ %i5, %bb14 ], [ 0, %bb ] + %i3 = phi i32 [ %i15, %bb14 ], [ 1, %bb ] + br label %bb4 + +bb4: ; preds = %bb18, %bb2 + %i5 = phi i64 [ %i19, %bb18 ], [ %i, %bb2 ] + %i6 = phi i64 [ %i20, %bb18 ], [ %i, %bb2 ] + %i7 = phi i32 [ 0, %bb18 ], [ %i3, %bb2 ] + %i8 = icmp eq i64 %i6, 0 + br i1 %i8, label %bb16, label %bb9 + +bb9: ; preds = %bb4 + br i1 %arg, label %bb12, label %bb10 + +bb10: ; preds = %bb9 + %i11 = sdiv i64 0, 0 + br label %bb12 + +bb12: ; preds = %bb10, %bb9 + br i1 %arg, label %bb13, label %bb14 + +bb13: ; preds = %bb12 + br label %bb14 + +bb14: ; preds = %bb13, %bb12 + %i15 = phi i32 [ 0, %bb13 ], [ %i7, %bb12 ] + switch i32 %arg1, label %bb22 [ + i32 0, label %bb21 + i32 1, label %bb2 + ] + +bb16: ; preds = %bb4 + br i1 %arg, label %bb18, label %bb17 + +bb17: ; preds = %bb16 + br label %bb18 + +bb18: ; preds = %bb17, %bb16 + %i19 = phi i64 [ 9166129423, %bb17 ], [ %i5, %bb16 ] + %i20 = phi i64 [ 9166129423, %bb17 ], [ %i6, %bb16 ] + br i1 %arg, label %bb22, label %bb4 + +bb21: ; preds = %bb14 + br label %bb22 + +bb22: ; preds = %bb21, %bb18, %bb14 + %i23 = phi i32 [ %arg1, %bb21 ], [ %i15, %bb14 ], [ 0, %bb18 ] + ret i32 %i23 +} diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll index 3fc4ed99fad0fa..f8d32fc2d29252 100644 --- a/llvm/test/CodeGen/X86/addcarry.ll +++ b/llvm/test/CodeGen/X86/addcarry.ll @@ -1490,3 +1490,26 @@ define { i64, i64 } @addcarry_commutative_2(i64 %x0, i64 %x1, i64 %y0, i64 %y1) %r1 = insertvalue { i64, i64 } %r0, i64 %b1s, 1 ret { i64, i64 } %r1 } + +define i1 @pr84831(i64 %arg) { +; CHECK-LABEL: pr84831: +; CHECK: # %bb.0: +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: setne %al +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: addb $-1, %al +; CHECK-NEXT: adcq $1, %rcx +; CHECK-NEXT: 
setb %al +; CHECK-NEXT: retq + %a = icmp ult i64 0, %arg + %add1 = add i64 0, 1 + %carryout1 = icmp ult i64 %add1, 0 + %b = zext i1 %a to i64 + %add2 = add i64 %add1, %b + %carryout2 = icmp ult i64 %add2, %add1 + %zc1 = zext i1 %carryout1 to i63 + %zc2 = zext i1 %carryout2 to i63 + %or = or i63 %zc1, %zc2 + %trunc = trunc i63 %or to i1 + ret i1 %trunc +} diff --git a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll index 0826faa1071b01..482713e12d15c7 100644 --- a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll @@ -381,3 +381,25 @@ entry: %1 = shufflevector <8 x bfloat> %0, <8 x bfloat> undef, <16 x i32> zeroinitializer ret <16 x bfloat> %1 } + +define <16 x i32> @pr83358() { +; X86-LABEL: pr83358: +; X86: # %bb.0: +; X86-NEXT: vcvtneps2bf16y {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0x05,A,A,A,A] +; X86-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-NEXT: vshufi64x2 $0, %zmm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x43,0xc0,0x00] +; X86-NEXT: # zmm0 = zmm0[0,1,0,1,0,1,0,1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: pr83358: +; X64: # %bb.0: +; X64-NEXT: vcvtneps2bf16y {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0x05,A,A,A,A] +; X64-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-NEXT: vshufi64x2 $0, %zmm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x43,0xc0,0x00] +; X64-NEXT: # zmm0 = zmm0[0,1,0,1,0,1,0,1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> ) + %2 = bitcast <8 x bfloat> %1 to <4 x i32> + %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <16 x i32> + ret <16 x i32> %3 +} diff --git a/llvm/test/CodeGen/X86/avx512bwvl-arith.ll b/llvm/test/CodeGen/X86/avx512bwvl-arith.ll index 4988fc35b10eef..fdc25f44b156a7 100644 --- 
a/llvm/test/CodeGen/X86/avx512bwvl-arith.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-arith.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,-evex512 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,EVEX256 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,-evex512 | FileCheck %s --check-prefixes=CHECK,EVEX512 ; 256-bit @@ -236,3 +236,34 @@ define <8 x i16> @vpmullw128_test(<8 x i16> %i, <8 x i16> %j) { ret <8 x i16> %x } +define i16 @PR90356(<16 x i1> %a) { +; EVEX256-LABEL: PR90356: +; EVEX256: # %bb.0: +; EVEX256-NEXT: vpsllw $7, %xmm0, %xmm0 +; EVEX256-NEXT: vpmovb2m %xmm0, %k1 +; EVEX256-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; EVEX256-NEXT: movb $63, %al +; EVEX256-NEXT: kmovd %eax, %k1 +; EVEX256-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} +; EVEX256-NEXT: vptestmd %zmm0, %zmm0, %k0 +; EVEX256-NEXT: kmovd %k0, %eax +; EVEX256-NEXT: # kill: def $ax killed $ax killed $eax +; EVEX256-NEXT: vzeroupper +; EVEX256-NEXT: retq +; +; EVEX512-LABEL: PR90356: +; EVEX512: # %bb.0: +; EVEX512-NEXT: vpsllw $7, %xmm0, %xmm0 +; EVEX512-NEXT: vpmovb2m %xmm0, %k0 +; EVEX512-NEXT: vpmovm2w %k0, %ymm0 +; EVEX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; EVEX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; EVEX512-NEXT: vpmovw2m %ymm0, %k0 +; EVEX512-NEXT: kmovd %k0, %eax +; EVEX512-NEXT: # kill: def $ax killed $ax killed $eax +; EVEX512-NEXT: vzeroupper +; EVEX512-NEXT: retq + %1 = shufflevector <16 x i1> %a, <16 x i1> zeroinitializer, <16 x i32> + %2 = bitcast <16 x i1> %1 to i16 + ret i16 %2 +} diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll index f2d3c4fb34199e..cd1dba17611628 100644 --- 
a/llvm/test/CodeGen/X86/bfloat.ll +++ b/llvm/test/CodeGen/X86/bfloat.ll @@ -2423,7 +2423,6 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind { ; AVXNC-LABEL: fptrunc_v16f32: ; AVXNC: # %bb.0: ; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0 -; AVXNC-NEXT: vinsertf128 $0, %xmm0, %ymm0, %ymm0 ; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm1, %xmm1 ; AVXNC-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVXNC-NEXT: retq diff --git a/llvm/test/CodeGen/X86/code-model-elf.ll b/llvm/test/CodeGen/X86/code-model-elf.ll index afcffb3a7adeda..b6634403dc1d05 100644 --- a/llvm/test/CodeGen/X86/code-model-elf.ll +++ b/llvm/test/CodeGen/X86/code-model-elf.ll @@ -346,7 +346,7 @@ define dso_local ptr @lea_forced_small_data() #0 { ; ; LARGE-STATIC-LABEL: lea_forced_small_data: ; LARGE-STATIC: # %bb.0: -; LARGE-STATIC-NEXT: movl $forced_small_data, %eax +; LARGE-STATIC-NEXT: movabsq $forced_small_data, %rax ; LARGE-STATIC-NEXT: retq ; ; SMALL-PIC-LABEL: lea_forced_small_data: @@ -399,7 +399,7 @@ define dso_local i32 @load_forced_small_data() #0 { ; ; LARGE-STATIC-LABEL: load_forced_small_data: ; LARGE-STATIC: # %bb.0: -; LARGE-STATIC-NEXT: movl $forced_small_data+8, %eax +; LARGE-STATIC-NEXT: movabsq $forced_small_data+8, %rax ; LARGE-STATIC-NEXT: movl (%rax), %eax ; LARGE-STATIC-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll index d223b75419ac47..294fcd6a9563eb 100644 --- a/llvm/test/CodeGen/X86/combine-and.ll +++ b/llvm/test/CodeGen/X86/combine-and.ll @@ -1171,6 +1171,25 @@ define <4 x i32> @neg_scalar_broadcast_two_uses(i32 %a0, <4 x i32> %a1, ptr %a2) ret <4 x i32> %4 } +; PR84660 - check for illegal types +define <2 x i128> @neg_scalar_broadcast_illegaltype(i128 %arg) { +; CHECK-LABEL: neg_scalar_broadcast_illegaltype: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: notl %esi +; CHECK-NEXT: andl $1, %esi +; CHECK-NEXT: movq %rsi, 16(%rdi) +; CHECK-NEXT: movq %rsi, (%rdi) +; CHECK-NEXT: movq $0, 24(%rdi) +; 
CHECK-NEXT: movq $0, 8(%rdi) +; CHECK-NEXT: retq + %i = xor i128 %arg, 1 + %i1 = insertelement <2 x i128> zeroinitializer, i128 %i, i64 0 + %i2 = shufflevector <2 x i128> %i1, <2 x i128> zeroinitializer, <2 x i32> zeroinitializer + %i3 = and <2 x i128> , %i2 + ret <2 x i128> %i3 +} + define <2 x i64> @andnp_xx(<2 x i64> %v0) nounwind { ; SSE-LABEL: andnp_xx: ; SSE: # %bb.0: diff --git a/llvm/test/CodeGen/X86/commute-blend-avx2.ll b/llvm/test/CodeGen/X86/commute-blend-avx2.ll index b5ffe78d29a610..75511104580e90 100644 --- a/llvm/test/CodeGen/X86/commute-blend-avx2.ll +++ b/llvm/test/CodeGen/X86/commute-blend-avx2.ll @@ -88,3 +88,12 @@ define <4 x double> @commute_fold_vblendpd_256(<4 x double> %a, ptr %b) #0 { ret <4 x double> %2 } declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone + +define <4 x float> @commute_vblendpd_128_for_code_size(<4 x float> %a, <4 x float> %b) optsize { +; CHECK-LABEL: commute_vblendpd_128_for_code_size: +; CHECK: # %bb.0: +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; CHECK-NEXT: retq + %r = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> + ret <4 x float> %r +} diff --git a/llvm/test/CodeGen/X86/load-combine.ll b/llvm/test/CodeGen/X86/load-combine.ll index 7f8115dc1ce389..b5f3e789918813 100644 --- a/llvm/test/CodeGen/X86/load-combine.ll +++ b/llvm/test/CodeGen/X86/load-combine.ll @@ -1282,3 +1282,35 @@ define i32 @zext_load_i32_by_i8_bswap_shl_16(ptr %arg) { %tmp8 = or i32 %tmp7, %tmp30 ret i32 %tmp8 } + +define i32 @pr80911_vector_load_multiuse(ptr %ptr, ptr %clobber) nounwind { +; CHECK-LABEL: pr80911_vector_load_multiuse: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl (%edx), %esi +; CHECK-NEXT: movzwl (%edx), %eax +; CHECK-NEXT: movl $0, (%ecx) +; CHECK-NEXT: movl %esi, (%edx) +; CHECK-NEXT: popl %esi +; CHECK-NEXT: retl +; +; CHECK64-LABEL: 
pr80911_vector_load_multiuse: +; CHECK64: # %bb.0: +; CHECK64-NEXT: movl (%rdi), %ecx +; CHECK64-NEXT: movzwl (%rdi), %eax +; CHECK64-NEXT: movl $0, (%rsi) +; CHECK64-NEXT: movl %ecx, (%rdi) +; CHECK64-NEXT: retq + %load = load <4 x i8>, ptr %ptr, align 16 + store i32 0, ptr %clobber + store <4 x i8> %load, ptr %ptr, align 16 + %e1 = extractelement <4 x i8> %load, i64 1 + %e1.ext = zext i8 %e1 to i32 + %e1.ext.shift = shl nuw nsw i32 %e1.ext, 8 + %e0 = extractelement <4 x i8> %load, i64 0 + %e0.ext = zext i8 %e0 to i32 + %res = or i32 %e1.ext.shift, %e0.ext + ret i32 %res +} diff --git a/llvm/test/CodeGen/X86/note-cet-property-inlineasm.ll b/llvm/test/CodeGen/X86/note-cet-property-inlineasm.ll new file mode 100644 index 00000000000000..a0e5b4add1b386 --- /dev/null +++ b/llvm/test/CodeGen/X86/note-cet-property-inlineasm.ll @@ -0,0 +1,30 @@ +; RUN: llc -mtriple x86_64-unknown-linux-gnu %s -o %t.o -filetype=obj +; RUN: llvm-readobj -n %t.o | FileCheck %s + +module asm ".pushsection \22.note.gnu.property\22,\22a\22,@note" +module asm " .p2align 3" +module asm " .long 1f - 0f" +module asm " .long 4f - 1f" +module asm " .long 5" +module asm "0: .asciz \22GNU\22" +module asm "1: .p2align 3" +module asm " .long 0xc0008002" +module asm " .long 3f - 2f" +module asm "2: .long ((1U << 0) | 0 | 0 | 0)" +module asm "3: .p2align 3" +module asm "4:" +module asm " .popsection" + +!llvm.module.flags = !{!0, !1} + +!0 = !{i32 4, !"cf-protection-return", i32 1} +!1 = !{i32 4, !"cf-protection-branch", i32 1} + +; CHECK: Type: NT_GNU_PROPERTY_TYPE_0 +; CHECK-NEXT: Property [ +; CHECK-NEXT: x86 feature: IBT, SHSTK +; CHECK-NEXT: ] +; CHECK: Type: NT_GNU_PROPERTY_TYPE_0 +; CHECK-NEXT: Property [ +; CHECK-NEXT: x86 ISA needed: x86-64-baseline +; CHECK-NEXT: ] diff --git a/llvm/test/CodeGen/X86/patchable-prologue.ll b/llvm/test/CodeGen/X86/patchable-prologue.ll index 71a392845fdea3..43761e3d1e1eb9 100644 --- a/llvm/test/CodeGen/X86/patchable-prologue.ll +++ 
b/llvm/test/CodeGen/X86/patchable-prologue.ll @@ -193,3 +193,20 @@ do.body: ; preds = %do.body, %entry do.end: ; preds = %do.body ret void } + + +; Test that inline asm is properly hotpatched. We currently don't examine the +; asm instruction when printing it, thus we always emit patching NOPs. + +; 64: inline_asm: +; 64-NEXT: # %bb.0: +; 64-NEXT: xchgw %ax, %ax # encoding: [0x66,0x90] +; 64-NEXT: #APP +; 64-NEXT: int3 # encoding: [0xcc] +; 64-NEXT: #NO_APP + +define dso_local void @inline_asm() "patchable-function"="prologue-short-redirect" { +entry: + call void asm sideeffect "int3", "~{dirflag},~{fpsr},~{flags}"() + ret void +} diff --git a/llvm/test/CodeGen/X86/sar_fold.ll b/llvm/test/CodeGen/X86/sar_fold.ll index 21655e19440afe..0f1396954b03a1 100644 --- a/llvm/test/CodeGen/X86/sar_fold.ll +++ b/llvm/test/CodeGen/X86/sar_fold.ll @@ -44,3 +44,44 @@ define i32 @shl24sar25(i32 %a) #0 { %2 = ashr exact i32 %1, 25 ret i32 %2 } + +define void @shl144sar48(ptr %p) #0 { +; CHECK-LABEL: shl144sar48: +; CHECK: # %bb.0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movswl (%eax), %ecx +; CHECK-NEXT: movl %ecx, %edx +; CHECK-NEXT: sarl $31, %edx +; CHECK-NEXT: shldl $2, %ecx, %edx +; CHECK-NEXT: shll $2, %ecx +; CHECK-NEXT: movl %ecx, 12(%eax) +; CHECK-NEXT: movl %edx, 16(%eax) +; CHECK-NEXT: movl $0, 8(%eax) +; CHECK-NEXT: movl $0, 4(%eax) +; CHECK-NEXT: movl $0, (%eax) +; CHECK-NEXT: retl + %a = load i160, ptr %p + %1 = shl i160 %a, 144 + %2 = ashr exact i160 %1, 46 + store i160 %2, ptr %p + ret void +} + +define void @shl144sar2(ptr %p) #0 { +; CHECK-LABEL: shl144sar2: +; CHECK: # %bb.0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movswl (%eax), %ecx +; CHECK-NEXT: shll $14, %ecx +; CHECK-NEXT: movl %ecx, 16(%eax) +; CHECK-NEXT: movl $0, 8(%eax) +; CHECK-NEXT: movl $0, 12(%eax) +; CHECK-NEXT: movl $0, 4(%eax) +; CHECK-NEXT: movl $0, (%eax) +; CHECK-NEXT: retl + %a = load i160, ptr %p + %1 = shl i160 %a, 144 + %2 = ashr exact i160 %1, 2 + store 
i160 %2, ptr %p + ret void +} diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll index cf45641fba6321..3316a332fafdff 100644 --- a/llvm/test/CodeGen/X86/shift-combine.ll +++ b/llvm/test/CodeGen/X86/shift-combine.ll @@ -787,3 +787,38 @@ define <4 x i32> @or_tree_with_mismatching_shifts_vec_i32(<4 x i32> %a, <4 x i32 %r = or <4 x i32> %or.ab, %or.cd ret <4 x i32> %r } + +; Reproducer for a DAGCombiner::combineShiftOfShiftedLogic bug. DAGCombiner +; need to check that the sum of the shift amounts fits in i8, which is the +; legal type used to described X86 shift amounts. Verify that we do not try to +; create a shift with 130+160 as shift amount, and verify that the stored +; value do not depend on %a1. +define void @combineShiftOfShiftedLogic(i128 %a1, i32 %a2, ptr %p) { +; X86-LABEL: combineShiftOfShiftedLogic: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %eax, 20(%ecx) +; X86-NEXT: movl $0, 16(%ecx) +; X86-NEXT: movl $0, 12(%ecx) +; X86-NEXT: movl $0, 8(%ecx) +; X86-NEXT: movl $0, 4(%ecx) +; X86-NEXT: movl $0, (%ecx) +; X86-NEXT: retl +; +; X64-LABEL: combineShiftOfShiftedLogic: +; X64: # %bb.0: +; X64-NEXT: # kill: def $edx killed $edx def $rdx +; X64-NEXT: shlq $32, %rdx +; X64-NEXT: movq %rdx, 16(%rcx) +; X64-NEXT: movq $0, 8(%rcx) +; X64-NEXT: movq $0, (%rcx) +; X64-NEXT: retq + %zext1 = zext i128 %a1 to i192 + %zext2 = zext i32 %a2 to i192 + %shl = shl i192 %zext1, 130 + %or = or i192 %shl, %zext2 + %res = shl i192 %or, 160 + store i192 %res, ptr %p, align 8 + ret void +} diff --git a/llvm/test/CodeGen/X86/tls-models.ll b/llvm/test/CodeGen/X86/tls-models.ll index fc8e302338d960..8de9de15a5f06e 100644 --- a/llvm/test/CodeGen/X86/tls-models.ll +++ b/llvm/test/CodeGen/X86/tls-models.ll @@ -5,6 +5,7 @@ ; Darwin always uses the same model. 
; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck -check-prefix=DARWIN %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -code-model=large | FileCheck -check-prefix=DARWIN %s @external_gd = external thread_local global i32 @internal_gd = internal thread_local global i32 42 diff --git a/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll b/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll index e03b4c1d34de15..07443a62b93391 100644 --- a/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll +++ b/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll @@ -1,19 +1,22 @@ ; RUN: llc --filetype=obj --mtriple=loongarch64 --mattr=-relax %s -o %t.o ; RUN: llvm-readobj -r %t.o | FileCheck --check-prefixes=RELOCS-BOTH,RELOCS-NORL %s -; RUN: llvm-objdump --source %t.o | FileCheck --check-prefix=SOURCE %s -; RUN: llvm-dwarfdump --debug-info --debug-line %t.o | FileCheck --check-prefix=DWARF %s +; RUN: llvm-objdump --source %t.o | FileCheck --check-prefixes=SOURCE,SOURCE-NORL %s +; RUN: llvm-dwarfdump --debug-info --debug-line %t.o | FileCheck --check-prefixes=DWARF,DWARF-NORL %s ; RUN: llc --filetype=obj --mtriple=loongarch64 --mattr=+relax %s -o %t.r.o ; RUN: llvm-readobj -r %t.r.o | FileCheck --check-prefixes=RELOCS-BOTH,RELOCS-ENRL %s -; RUN: llvm-objdump --source %t.r.o | FileCheck --check-prefix=SOURCE %s -; RUN: llvm-dwarfdump --debug-info --debug-line %t.r.o | FileCheck --check-prefix=DWARF %s +; RUN: llvm-objdump --source %t.r.o | FileCheck --check-prefixes=SOURCE,SOURCE-ENRL %s +; RUN: llvm-dwarfdump --debug-info --debug-line %t.r.o | FileCheck --check-prefixes=DWARF,DWARF-ENRL %s ; RELOCS-BOTH: Relocations [ ; RELOCS-BOTH-NEXT: Section ({{.*}}) .rela.text { -; RELOCS-BOTH-NEXT: 0x14 R_LARCH_PCALA_HI20 sym 0x0 -; RELOCS-ENRL-NEXT: 0x14 R_LARCH_RELAX - 0x0 -; RELOCS-BOTH-NEXT: 0x18 R_LARCH_PCALA_LO12 sym 0x0 -; RELOCS-ENRL-NEXT: 0x18 R_LARCH_RELAX - 0x0 +; RELOCS-NORL-NEXT: 0x14 R_LARCH_PCALA_HI20 sym 0x0 +; RELOCS-NORL-NEXT: 0x18 R_LARCH_PCALA_LO12 
sym 0x0 +; RELOCS-ENRL-NEXT: 0x0 R_LARCH_ALIGN .Lla-relax-align0 0x5 +; RELOCS-ENRL-NEXT: 0x30 R_LARCH_PCALA_HI20 sym 0x0 +; RELOCS-ENRL-NEXT: 0x30 R_LARCH_RELAX - 0x0 +; RELOCS-ENRL-NEXT: 0x34 R_LARCH_PCALA_LO12 sym 0x0 +; RELOCS-ENRL-NEXT: 0x34 R_LARCH_RELAX - 0x0 ; RELOCS-BOTH-NEXT: } ; RELOCS-BOTH: Section ({{.*}}) .rela.debug_frame { ; RELOCS-NORL-NEXT: 0x1C R_LARCH_32 .debug_frame 0x0 @@ -36,7 +39,8 @@ ; RELOCS-BOTH-NEXT: } ; RELOCS-BOTH-NEXT: ] -; SOURCE: 0000000000000000 : +; SOURCE-NORL: 0000000000000000 : +; SOURCE-ENRL: 000000000000001c : ; SOURCE: ; { ; SOURCE: ; asm volatile( ; SOURCE: ; return 0; @@ -87,11 +91,16 @@ ; DWARF-EMPTY: ; DWARF-NEXT: Address Line Column File ISA Discriminator OpIndex Flags ; DWARF-NEXT: ------------------ ------ ------ ------ --- ------------- ------- ------------- -; DWARF-NEXT: 0x0000000000000000 2 0 0 0 0 0 is_stmt -; DWARF-NEXT: 0x0000000000000010 3 3 0 0 0 0 is_stmt prologue_end -; DWARF-NEXT: 0x0000000000000020 10 3 0 0 0 0 is_stmt -; DWARF-NEXT: 0x000000000000002c 10 3 0 0 0 0 epilogue_begin -; DWARF-NEXT: 0x0000000000000034 10 3 0 0 0 0 end_sequence +; DWARF-NORL-NEXT: 0x0000000000000000 2 0 0 0 0 0 is_stmt +; DWARF-NORL-NEXT: 0x0000000000000010 3 3 0 0 0 0 is_stmt prologue_end +; DWARF-NORL-NEXT: 0x0000000000000020 10 3 0 0 0 0 is_stmt +; DWARF-NORL-NEXT: 0x000000000000002c 10 3 0 0 0 0 epilogue_begin +; DWARF-NORL-NEXT: 0x0000000000000034 10 3 0 0 0 0 end_sequence +; DWARF-ENRL-NEXT: 0x000000000000001c 2 0 0 0 0 0 is_stmt +; DWARF-ENRL-NEXT: 0x000000000000002c 3 3 0 0 0 0 is_stmt prologue_end +; DWARF-ENRL-NEXT: 0x000000000000003c 10 3 0 0 0 0 is_stmt +; DWARF-ENRL-NEXT: 0x0000000000000048 10 3 0 0 0 0 epilogue_begin +; DWARF-ENRL-NEXT: 0x0000000000000050 10 3 0 0 0 0 end_sequence ; ModuleID = 'dwarf-loongarch-relocs.c' source_filename = "dwarf-loongarch-relocs.c" diff --git a/llvm/test/Instrumentation/ThreadSanitizer/atomic.ll b/llvm/test/Instrumentation/ThreadSanitizer/atomic.ll index 
76afc4bf007c2d..8b387cd4962979 100644 --- a/llvm/test/Instrumentation/ThreadSanitizer/atomic.ll +++ b/llvm/test/Instrumentation/ThreadSanitizer/atomic.ll @@ -78,6 +78,26 @@ entry: ; CHECK-LABEL: atomic8_xchg_monotonic ; CHECK: call i8 @__tsan_atomic8_exchange(ptr %a, i8 0, i32 0), !dbg +define void @atomic8_xchg_monotonic_ptr(ptr %a, ptr %b) nounwind uwtable { +entry: + atomicrmw xchg ptr %a, ptr %b monotonic, !dbg !7 + ret void, !dbg !7 +} +; CHECK-LABEL: atomic8_xchg_monotonic_ptr +; CHECK: [[ARG:%.*]] = ptrtoint ptr %b to i64, !dbg +; CHECK: [[RES:%.*]] = call i64 @__tsan_atomic64_exchange(ptr %a, i64 [[ARG]], i32 0), !dbg +; CHECK: [[CAST:%.*]] = inttoptr i64 [[RES]] to ptr, !dbg + +define void @atomic8_xchg_monotonic_float(ptr %a, float %b) nounwind uwtable { +entry: + atomicrmw xchg ptr %a, float %b monotonic, !dbg !7 + ret void, !dbg !7 +} +; CHECK-LABEL: atomic8_xchg_monotonic_float +; CHECK: [[ARG:%.*]] = bitcast float %b to i32, !dbg +; CHECK: [[RES:%.*]] = call i32 @__tsan_atomic32_exchange(ptr %a, i32 [[ARG]], i32 0), !dbg +; CHECK: [[CAST:%.*]] = bitcast i32 [[RES]] to float, !dbg + define void @atomic8_add_monotonic(ptr %a) nounwind uwtable { entry: atomicrmw add ptr %a, i8 0 monotonic, !dbg !7 diff --git a/llvm/test/MC/AArch64/armv8.2a-dotprod.s b/llvm/test/MC/AArch64/armv8.2a-dotprod.s index 9c4a6cad7e07a6..26afbe149dd00b 100644 --- a/llvm/test/MC/AArch64/armv8.2a-dotprod.s +++ b/llvm/test/MC/AArch64/armv8.2a-dotprod.s @@ -15,6 +15,7 @@ // RUN: llvm-mc -triple aarch64 -mattr=+v8r -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD // RUN: llvm-mc -triple aarch64 -mcpu=ampere1 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD // RUN: llvm-mc -triple aarch64 -mcpu=ampere1a -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD +// RUN: llvm-mc -triple aarch64 -mcpu=ampere1b -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD // RUN: not llvm-mc -triple aarch64 -mattr=+v8.2a -show-encoding < %s 2> %t // 
RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s @@ -42,6 +43,8 @@ // RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s // RUN: not llvm-mc -triple aarch64 -mcpu=ampere1a -mattr=-dotprod -show-encoding < %s 2> %t // RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s +// RUN: not llvm-mc -triple aarch64 -mcpu=ampere1b -mattr=-dotprod -show-encoding < %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s udot v0.2s, v1.8b, v2.8b sdot v0.2s, v1.8b, v2.8b diff --git a/llvm/test/MC/AArch64/cfi-bad-nesting-darwin.s b/llvm/test/MC/AArch64/cfi-bad-nesting-darwin.s index 235b7d44809929..3a5af86defc592 100644 --- a/llvm/test/MC/AArch64/cfi-bad-nesting-darwin.s +++ b/llvm/test/MC/AArch64/cfi-bad-nesting-darwin.s @@ -8,6 +8,10 @@ .p2align 2 _locomotive: .cfi_startproc + ; An N_ALT_ENTRY symbol can be defined in the middle of a subsection, so + ; these are opted out of the .cfi_{start,end}proc nesting check. + .alt_entry _engineer +_engineer: ret ; It is invalid to have a non-private label between .cfi_startproc and @@ -17,7 +21,7 @@ _locomotive: .p2align 2 _caboose: ; DARWIN: [[#@LINE-1]]:1: error: non-private labels cannot appear between .cfi_startproc / .cfi_endproc pairs -; DARWIN: [[#@LINE-10]]:2: error: previous .cfi_startproc was here +; DARWIN: [[#@LINE-14]]:2: error: previous .cfi_startproc was here ret .cfi_endproc diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index 35411ee0ba2a5e..c9c4fceffaeb0c 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -397,6 +397,51 @@ v_ctz_i32_b32 v5, src_scc v_ctz_i32_b32 v255, 0xaf123456 // GFX12: encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf] +v_cvt_f32_bf8_e32 v1, s3 +// GFX12: encoding: [0x03,0xda,0x02,0x7e] + +v_cvt_f32_bf8_e32 v1, 3 +// GFX12: encoding: [0x83,0xda,0x02,0x7e] + +v_cvt_f32_bf8_e32 v1, v3 +// GFX12: encoding: [0x03,0xdb,0x02,0x7e] + +v_cvt_f32_fp8_e32 v1, s3 +// GFX12: encoding: 
[0x03,0xd8,0x02,0x7e] + +v_cvt_f32_fp8_e32 v1, 3 +// GFX12: encoding: [0x83,0xd8,0x02,0x7e] + +v_cvt_f32_fp8_e32 v1, v3 +// GFX12: encoding: [0x03,0xd9,0x02,0x7e] + +v_cvt_pk_f32_bf8_e32 v[2:3], s3 +// GFX12: encoding: [0x03,0xde,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[3:4], s5 +// GFX12: encoding: [0x05,0xde,0x06,0x7e] + +v_cvt_pk_f32_bf8_e32 v[2:3], 3 +// GFX12: encoding: [0x83,0xde,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[3:4], 3 +// GFX12: encoding: [0x83,0xde,0x06,0x7e] + +v_cvt_pk_f32_bf8_e32 v[2:3], v3 +// GFX12: encoding: [0x03,0xdf,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[3:4], v3 +// GFX12: encoding: [0x03,0xdf,0x06,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], s3 +// GFX12: encoding: [0x03,0xdc,0x04,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], 3 +// GFX12: encoding: [0x83,0xdc,0x04,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], v3 +// GFX12: encoding: [0x03,0xdd,0x04,0x7e] + v_cvt_f16_f32 v5, v1 // GFX12: encoding: [0x01,0x15,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index dd6afb28c396a7..5e0e1b688bc582 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -337,6 +337,18 @@ v_ctz_i32_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_ctz_i32_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: encoding: [0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x05,0x30] +v_cvt_f32_fp8 v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc +// GFX12: encoding: [0xfa,0xd8,0x02,0x7e,0x03,0xe4,0x00,0xac] + +v_cvt_f32_fp8 v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe +// GFX12: encoding: [0xfa,0xd8,0x02,0x7e,0x03,0x1b,0x00,0x2e] + +v_cvt_f32_bf8 v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc +// GFX12: encoding: [0xfa,0xda,0x02,0x7e,0x03,0xe4,0x00,0xac] + +v_cvt_f32_bf8 v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe +// GFX12: encoding: [0xfa,0xda,0x02,0x7e,0x03,0x1b,0x00,0x2e] + v_cvt_f16_f32 v5, v1 quad_perm:[3,2,1,0] // 
GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index 6530de0268456d..36c89710ce8f89 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -73,6 +73,18 @@ v_ctz_i32_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_ctz_i32_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: encoding: [0xe9,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00] +v_cvt_f32_fp8 v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0xe9,0xd8,0x0a,0x7e,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_fp8 v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0xd8,0x02,0x7e,0x03,0x77,0x39,0x05] + +v_cvt_f32_bf8 v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0xe9,0xda,0x0a,0x7e,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_bf8 v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0xda,0x02,0x7e,0x03,0x77,0x39,0x05] + v_cvt_f16_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: encoding: [0xe9,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s index cf3f9c45bdcc8e..beb57999b855ea 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s @@ -1099,6 +1099,42 @@ v_cubetc_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 // GFX12: encoding: [0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] +v_cvt_pk_fp8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_pk_fp8_f32 v1, -v2, |v3| +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20] + +v_cvt_pk_fp8_f32 v1, s2, 3 +// GFX12: encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00] + +v_cvt_pk_bf8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_pk_bf8_f32 v1, -v2, |v3| +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20] + +v_cvt_pk_bf8_f32 v1, s2, 3 +// GFX12: encoding: 
[0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00] + +v_cvt_sr_fp8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_sr_fp8_f32 v10, s2, v5 +// GFX12: encoding: [0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00] + +v_cvt_sr_fp8_f32 v5, -|v255|, v4 +// GFX12: encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20] + +v_cvt_sr_bf8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_sr_bf8_f32 v10, s2, v5 +// GFX12: encoding: [0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00] + +v_cvt_sr_bf8_f32 v5, -|v255|, v4 +// GFX12: encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20] + v_cvt_pk_i16_f32 v5, v1, v2 // GFX12: encoding: [0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s index 26f63102df9508..df3430f376f69e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s @@ -1015,6 +1015,114 @@ v_cubetc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_m v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0x87,0x0e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,1,2,3] +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_pk_bf8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe 
bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,1,2,3] +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_pk_fp8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_pk_fp8_f32_e64_dpp 
v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_sr_bf8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: 
[0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_sr_fp8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] // GFX12: [0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s index de294b1ff2a22a..09dd6df618c5b6 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s @@ -570,6 +570,54 @@ v_cubetc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // 
GFX12: [0xff,0x87,0x0e,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,2,3,0,1] +// GFX12: encoding: [0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21] + +v_cvt_pk_fp8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_fp8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_fp8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x6a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0x05,0x02,0x6a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x03,0x6a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] + +v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: 
[0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] + v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index e35bb632906722..7ee60262a5c1b4 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -396,6 +396,144 @@ v_ctz_i32_b32_e64 v5, src_scc v_ctz_i32_b32_e64 v255, 0xaf123456 // GFX12: encoding: [0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] +v_cvt_f32_bf8_e64 v1, s3 +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 
op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], s3 +// GFX12: encoding: [0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], s3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], 3 +// GFX12: encoding: [0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], 3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], v3 +// GFX12: encoding: 
[0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], v3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], s3 +// GFX12: encoding: [0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], s3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], 3 +// GFX12: encoding: [0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], 3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], v3 +// GFX12: encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], v3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], s3 +// GFX12: encoding: [0x03,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], s3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], 3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], v3 +// GFX12: encoding: [0x03,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], v3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], s3 +// GFX12: encoding: [0x03,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], 3 +// GFX12: encoding: [0x03,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], 3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], v3 +// GFX12: encoding: [0x03,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], v3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xee,0xd5,0x03,0x01,0x00,0x00] + v_cvt_f16_f32_e64 v5, v1 // GFX12: encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s 
b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index 6b915bd14683a2..808f941197c427 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -336,6 +336,18 @@ v_ctz_i32_b32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_ctz_i32_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0x00,0xba,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] +V_CVT_F32_FP8_e64_dpp v5, v1 quad_perm:[3,1,2,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x05,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x01,0x27,0x00,0x2d] + +V_CVT_F32_FP8_e64_dpp v1, v3 quad_perm:[2,1,0,3] row_mask:0x5 bank_mask:0xe +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0xc6,0x00,0x5e] + +V_CVT_F32_BF8_e64_dpp v5, v1 quad_perm:[0,3,2,1] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0x6c,0x00,0x2d] + +V_CVT_F32_BF8_e64_dpp v1, v3 quad_perm:[0,1,3,2] row_mask:0x5 bank_mask:0xe +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0xb4,0x00,0x5e] + v_cvt_f16_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: [0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index 61266f3776c284..f7b51cfb6bda8e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -84,6 +84,18 @@ v_ctz_i32_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_ctz_i32_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: [0xff,0x00,0xba,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +v_cvt_f32_fp8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0x05,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_fp8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: 
[0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] + +v_cvt_f32_bf8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_bf8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] + v_cvt_f16_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s new file mode 100644 index 00000000000000..e1cd0cab663472 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s @@ -0,0 +1,1529 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: %s + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], s[0:3], v[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], s[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], s[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], 1.0, v[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 
v[8:15], v[0:3], 1.0, v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], s[0:3], v[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], s[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], s[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], 1.0, v[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], 1.0, v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + 
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], s[0:3], v[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], s[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], s[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], 1.0, v[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], 1.0, v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], 
v[0:3], v[4:7], 1.0 +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], s[0:3], v[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], s[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], s[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], 1.0, v[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], 1.0, v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + 
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], 1, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], 1, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: 
[0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b] + + + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], s0, v1, v[2:9] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, s1, v[2:9] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, s[0:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], 1, v1, v[2:9] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, 1, v[2:9] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX12-ERR: 
:[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: 
v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: 
v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: 
v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], 1, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], 1, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: 
[0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b] + + + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f32_16x16x32_f16 v[12:19], s[0:3], v[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], s[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], s20 +// GFX12-ERR: 
:[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], 1.0, v[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], 1.0, v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f32_16x16x32_bf16 v[12:19], s[0:3], v[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], s[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], s20 +// GFX12-ERR: 
:[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], 1.0, v[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], 1.0, v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f16_16x16x32_f16 v[12:15], s[0:3], v[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], s[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], s16 +// GFX12-ERR: 
:[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], 1.0, v[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], 1.0, v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_bf16_16x16x32_bf16 v[12:15], s[0:3], v[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], s[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], s16 
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], 1.0, v[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], 1.0, v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x32_iu8 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], 1, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], 1, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 +// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp +// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp ; encoding: [0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 +// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x32_iu4 v[3:10], s0, v[1:2], v11 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, s[0:1], v11 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], s11 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], 1, v[1:2], v11 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, 1, v11 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c] + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c] + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x64_iu4 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], 1, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], 1, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], 1.0, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], 1.0, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], 1.0, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], 1.0, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], 1.0, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], 1.0, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], 1.0, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], 1.0, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s new file mode 100644 index 00000000000000..8bd9e5039b7207 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s @@ -0,0 +1,1529 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: %s + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_f16 v[4:7], 
s[0:1], v[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], s[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], s[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], 1.0, v[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], 1.0, v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf16 v[4:7], s[0:1], v[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], s[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], s[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], 
1.0, v[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], 1.0, v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f16_16x16x16_f16 v[4:5], s[0:1], v[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], s[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], s[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], 1.0, v[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], 1.0, v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 +// GFX12: 
v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], s[0:1], v[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], s[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], s[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], 1.0, v[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], 1.0, v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], 
v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], 1, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, 1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b] + + + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] ; encoding: 
[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_i32_16x16x16_iu4 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], 1, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, 1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 +// GFX12: 
v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 
v[2:5] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 
v[2:5] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 
v[2:5] +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a] + + + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + 
+v_swmmac_f32_16x16x32_f16 v[6:9], s[0:1], v[2:5], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], s[0:3], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], s10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], 1.0, v[2:5], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], 1.0, v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f32_16x16x32_bf16 v[6:9], s[0:1], v[2:5], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for 
instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], s[0:3], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], s10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], 1.0, v[2:5], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], 1.0, v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f16_16x16x32_f16 v[6:7], s[0:1], v[2:5], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], s[0:3], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], 
v[0:1], v[2:5], s8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], 1.0, v[2:5], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], 1.0, v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_bf16_16x16x32_bf16 v[6:7], s[0:1], v[2:5], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], s[0:3], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + 
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], s8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], 1.0, v[2:5], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], 1.0, v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x32_iu8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], 1, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, 1, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp ; encoding: [0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x32_iu4 v[2:5], s0, v1, v6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, s1, v6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, s6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], 1, v1, v6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, 1, v6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x64_iu4 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], 1, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, 1, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], 1.0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, 1.0, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], 1.0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, 1.0, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], 1.0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, 1.0, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], 1.0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, 1.0, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt b/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt index 907d0c319efd57..259cb9dbc52a46 100644 --- a/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt +++ b/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt @@ -14,6 +14,7 @@ # RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=neoverse-n2 --disassemble < %s | FileCheck %s # RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=ampere1 --disassemble < %s | FileCheck %s # RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=ampere1a --disassemble < %s | FileCheck %s +# RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=ampere1b --disassemble < %s | FileCheck %s # CHECK: ldaprb w0, [x0] # CHECK: ldaprh w0, [x0] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt index a839f03c42ba19..39bb7338c80748 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt @@ -397,6 +397,42 @@ # GFX12: 
v_ctz_i32_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf] 0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf +# GFX12: v_cvt_f32_bf8_e32 v1, s3 ; encoding: [0x03,0xda,0x02,0x7e] +0x03,0xda,0x02,0x7e + +# GFX12: v_cvt_f32_bf8_e32 v1, 3 ; encoding: [0x83,0xda,0x02,0x7e] +0x83,0xda,0x02,0x7e + +# GFX12: v_cvt_f32_bf8_e32 v1, v3 ; encoding: [0x03,0xdb,0x02,0x7e] +0x03,0xdb,0x02,0x7e + +# GFX12: v_cvt_f32_fp8_e32 v1, s3 ; encoding: [0x03,0xd8,0x02,0x7e] +0x03,0xd8,0x02,0x7e + +# GFX12: v_cvt_f32_fp8_e32 v1, 3 ; encoding: [0x83,0xd8,0x02,0x7e] +0x83,0xd8,0x02,0x7e + +# GFX12: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e] +0x03,0xd9,0x02,0x7e + +# GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], s3 ; encoding: [0x03,0xde,0x04,0x7e] +0x03,0xde,0x04,0x7e + +# GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], 3 ; encoding: [0x83,0xde,0x04,0x7e] +0x83,0xde,0x04,0x7e + +# GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], v3 ; encoding: [0x03,0xdf,0x04,0x7e] +0x03,0xdf,0x04,0x7e + +# GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], s3 ; encoding: [0x03,0xdc,0x04,0x7e] +0x03,0xdc,0x04,0x7e + +# GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], 3 ; encoding: [0x83,0xdc,0x04,0x7e] +0x83,0xdc,0x04,0x7e + +# GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], v3 ; encoding: [0x03,0xdd,0x04,0x7e] +0x03,0xdd,0x04,0x7e + # GFX12: v_cvt_f16_f32_e32 v5, v1 ; encoding: [0x01,0x15,0x0a,0x7e] 0x01,0x15,0x0a,0x7e diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index bcb9ad9febb96d..5848333f41ef7c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -337,6 +337,18 @@ # GFX12: v_ctz_i32_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x0d,0x30 +# GFX12: v_cvt_f32_fp8_dpp v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc ; encoding: 
[0xfa,0xd8,0x02,0x7e,0x03,0xe4,0x00,0xac] +0xfa,0xd8,0x02,0x7e,0x03,0xe4,0x00,0xac + +# GFX12: v_cvt_f32_fp8_dpp v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe ; encoding: [0xfa,0xd8,0x02,0x7e,0x03,0x1b,0x00,0x2e] +0xfa,0xd8,0x02,0x7e,0x03,0x1b,0x00,0x2e + +# GFX12: v_cvt_f32_bf8_dpp v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc ; encoding: [0xfa,0xda,0x02,0x7e,0x03,0xe4,0x00,0xac] +0xfa,0xda,0x02,0x7e,0x03,0xe4,0x00,0xac + +# GFX12: v_cvt_f32_bf8_dpp v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe ; encoding: [0xfa,0xda,0x02,0x7e,0x03,0x1b,0x00,0x2e] +0xfa,0xda,0x02,0x7e,0x03,0x1b,0x00,0x2e + # GFX12: v_cvt_f16_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x14,0x0a,0x7e,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index 928165997dd919..d42e9ae25039bc 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -49,6 +49,18 @@ # GFX12: v_ctz_i32_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xea,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12: v_cvt_f32_fp8_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0xe9,0xd8,0x0a,0x7e,0x01,0x88,0xc6,0xfa] +0xe9,0xd8,0x0a,0x7e,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_fp8_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd8,0x02,0x7e,0x03,0x77,0x39,0x05] +0xe9,0xd8,0x02,0x7e,0x03,0x77,0x39,0x05 + +# GFX12: v_cvt_f32_bf8_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0xe9,0xda,0x0a,0x7e,0x01,0x88,0xc6,0xfa] +0xe9,0xda,0x0a,0x7e,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_bf8_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xda,0x02,0x7e,0x03,0x77,0x39,0x05] +0xe9,0xda,0x02,0x7e,0x03,0x77,0x39,0x05 + # GFX12: v_cvt_f16_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt index db690aa99e4ab7..f86903b8de44b7 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt @@ -993,6 +993,42 @@ # GFX12: v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] 0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf +# GFX12: v_cvt_pk_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00] +0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00 + +# GFX12: v_cvt_pk_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00] +0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00 + +# GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_sr_fp8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00] +0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00 + +# GFX12: v_cvt_sr_fp8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20] +0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20 + +# GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_sr_bf8_f32 v10, s2, v5 ; 
encoding: [0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00] +0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00 + +# GFX12: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20] +0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20 + # GFX12: v_cvt_pk_i16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00] 0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt index 69f61c7eb8030f..1be1d6e91ad8a7 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt @@ -825,6 +825,114 @@ # GFX12: v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x0e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] 0xff,0x87,0x0e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30 +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] 
+0x01,0x02,0x6a,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: 
[0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: 
v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: 
[0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + # GFX12: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt index a7f0183016147f..44b3f7594029fd 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt @@ -495,6 +495,54 @@ # GFX12: v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xff,0x87,0x0e,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] 0xff,0x87,0x0e,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00 +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21] +0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +0x05,0x01,0x6a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x6a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x02,0x6a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x6a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0xff,0x03,0x6a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 
+0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x01,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] +0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x01,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] +0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00 + # GFX12: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index 4fe4284e8eb4e6..9a8368a65f3d37 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ 
b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -396,6 +396,42 @@ # GFX12: v_ctz_i32_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cvt_f32_bf8_e64 v1, s3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00] +0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_bf8_e64 v1, 3 ; encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] +0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_bf8_e64 v1, v3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00] +0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00 + +# GFX12: v_cvt_f32_fp8_e64 v1, s3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00] +0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_fp8_e64 v1, 3 ; encoding: [0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00] +0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_fp8_e64 v1, v3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00] +0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00 + +# GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], s3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] +0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], 3 ; encoding: [0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00] +0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] +0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00 + +# GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], s3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] +0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], 3 ; encoding: [0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] +0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] +0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00 + # GFX12: v_cvt_f16_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00] 
0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index e914d139e240e1..8af274e0b4028f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -336,6 +336,18 @@ # GFX12: v_ctz_i32_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xba,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0xff,0x00,0xba,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cvt_f32_fp8_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x2 bank_mask:0xd ; encoding: [0x05,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d] +0x05,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d + +# GFX12: v_cvt_f32_fp8_e64_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0x5 bank_mask:0xe ; encoding: [0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e] +0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e + +# GFX12: v_cvt_f32_bf8_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x2 bank_mask:0xd ; encoding: [0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d] +0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d + +# GFX12: v_cvt_f32_bf8_e64_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0x5 bank_mask:0xe ; encoding: [0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e] +0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e + # GFX12: v_cvt_f16_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index 2a4b677620d387..3d48d58c775b18 100644 --- 
a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -72,6 +72,18 @@ # GFX12: v_ctz_i32_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xba,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0xff,0x00,0xba,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cvt_f32_fp8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x05,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] +0x05,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_fp8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] +0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05 + +# GFX12: v_cvt_f32_bf8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] +0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_bf8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] +0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05 + # GFX12: v_cvt_f16_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt new file mode 100644 index 00000000000000..5079d2f0896561 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt @@ -0,0 +1,1628 @@ +# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX12-ERR %s + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], 
v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x40,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x58,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: 
[0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], s[8:15]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x40,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], 1.0/*Invalid immediate*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], 1.0/*Invalid immediate*/, v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] ; 
encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x41,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c] + 
+[0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], s[8:15]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x41,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], 1.0/*Invalid immediate*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], 1.0/*Invalid immediate*/, v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: 
[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x42,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c] + 
+[0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], s[8:11]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x42,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], 1.0/*Invalid immediate*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], 1.0/*Invalid immediate*/, v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: 
[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x43,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c] + 
+[0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], s[8:11]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x43,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], 1.0/*Invalid immediate*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], 1.0/*Invalid immediate*/, v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: 
[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x48,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x41,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x04,0x44,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x44,0xcc,0x81,0x04,0x12,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], 1/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x01,0x04,0x12,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x03,0x11,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], 1/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x03,0x10,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b] + + + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] # sgpr src0 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] + 
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] # sgpr src1 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, s1/*Invalid register, operand has 'VGPR_32' register class*/, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x18] # sgpr src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, s[0:7]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x45,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], 1/*Invalid immediate*/, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, 1/*Invalid immediate*/, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x46,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding 
+ +[0x04,0x00,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: 
[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x46,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x48,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x04,0x58,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x48,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], 
1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x47,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x47,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: 
v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x49,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x04,0x42,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x49,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c] # 
clamp +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x48,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x41,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid 
instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x4a,0xcc,0x81,0x04,0x12,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], 1/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x01,0x04,0x12,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x03,0x11,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], 1/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x03,0x10,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b] + + + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c] +# GFX12:v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0xc0,0x50,0xcc,0x00,0x09,0x52,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[1,0,0] +# GFX12:v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: 
warning: invalid instruction encoding + +[0x0c,0x60,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x44,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c] # sgpr src0 +# 
GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x50,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x50,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], s20/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x18] + +[0x0c,0x40,0x50,0xcc,0xf2,0x08,0x52,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], 1.0/*Invalid immediate*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0xe5,0x51,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], 1.0/*Invalid immediate*/, v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x01,0x50,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x02,0x18] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x06,0x18] + + + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0xc0,0x51,0xcc,0x00,0x09,0x52,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid 
instruction encoding + +[0x0c,0x60,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x44,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c] # sgpr src0 +# GFX12: 
v_swmmac_f32_16x16x32_bf16 v[12:19], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x50,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x50,0x1c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], s20/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x18] + +[0x0c,0x40,0x51,0xcc,0xf2,0x08,0x52,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], 1.0/*Invalid immediate*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x51,0xcc,0x00,0xe5,0x51,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], 1.0/*Invalid immediate*/, v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x01,0x50,0x1c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x02,0x18] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x06,0x18] + + + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0xc0,0x52,0xcc,0x00,0x09,0x42,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x50,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid 
instruction encoding + +[0x0c,0x60,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x50,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x44,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c] # sgpr src0 +# GFX12: 
v_swmmac_f16_16x16x32_f16 v[12:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x40,0x1c] # sgpr src1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x40,0x1c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x18] # sgpr src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], s16/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x18] + +[0x0c,0x40,0x52,0xcc,0xf2,0x08,0x42,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], 1.0/*Invalid immediate*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c] + +[0x0c,0x40,0x52,0xcc,0x00,0xe5,0x41,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], 1.0/*Invalid immediate*/, v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x01,0x40,0x1c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x02,0x18] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x06,0x18] + + + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0xc0,0x53,0xcc,0x00,0x09,0x42,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x50,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid 
instruction encoding + +[0x0c,0x60,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x50,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x44,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c] # sgpr src0 +# GFX12: 
v_swmmac_bf16_16x16x32_bf16 v[12:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x40,0x1c] # sgpr src1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x40,0x1c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x18] # sgpr src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], s16/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x18] + +[0x0c,0x40,0x53,0xcc,0xf2,0x08,0x42,0x1c] # 1.0 src0 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], 1.0/*Invalid immediate*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c] + +[0x0c,0x40,0x53,0xcc,0x00,0xe5,0x41,0x1c] # 1.0 src1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], 1.0/*Invalid immediate*/, v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x01,0x40,0x1c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x02,0x18] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x06,0x18] + + + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] + 
+[0x06,0x50,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x54,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register 
class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x54,0xcc,0x81,0x04,0x3a,0x1c] # 1 src0 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], 1/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x01,0x04,0x3a,0x1c] + +[0x06,0x40,0x54,0xcc,0x00,0x03,0x39,0x1c] # 1 src1 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], 1/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x03,0x38,0x1c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x06,0x18] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x02,0x18] + + + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +[0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp ; encoding: [0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +[0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +[0x03,0x50,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x60,0x55,0xcc,0x00,0x03,0x2e,0x1c] # 
op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +[0x03,0x50,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x58,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x55,0xcc,0x00,0x02,0x2e,0x1c] # sgpr src0 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x02,0x2e,0x1c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2c,0x1c] # sgpr src1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], 
v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x01,0x2c,0x1c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x18] # sgpr src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], s11/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x18] + +[0x03,0x40,0x55,0xcc,0x81,0x02,0x2e,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], 1/*Invalid immediate*/, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x01,0x02,0x2e,0x1c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2d,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, 1/*Invalid immediate*/, v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2c,0x1c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x06,0x18] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] + + + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x48,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x50,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: 
warning: invalid instruction encoding + +[0x06,0x00,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x50,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x56,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: 
[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x56,0xcc,0x81,0x04,0x3a,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], 1/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x01,0x04,0x3a,0x1c] + +[0x06,0x40,0x56,0xcc,0x00,0x03,0x39,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], 1/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x03,0x38,0x1c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x06,0x18] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x02,0x18] + + + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x57,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 
index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x57,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x57,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], 
1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x57,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x58,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x3c] # 
neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x58,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x58,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x58,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x59,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x59,0xcc,0x00,0x05,0x3a,0x1c] # 
neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x59,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x59,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x59,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + 
+[0x06,0xc0,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x5a,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x5a,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x06,0x18] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt new file mode 100644 index 00000000000000..61700faa8e6075 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt @@ -0,0 +1,1628 @@ +# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +# RUN: not llvm-mc -disassemble -arch=amdgcn 
-mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX12-ERR %s + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x40,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: 
[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x02,0x18] + +[0x04,0x40,0x40,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], 1.0/*Invalid immediate*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], 1.0/*Invalid immediate*/, v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], 
v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x41,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c] + 
+[0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x02,0x18] + +[0x04,0x40,0x41,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], 1.0/*Invalid immediate*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], 1.0/*Invalid immediate*/, v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 ; encoding: 
[0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x42,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# 
GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/ ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x02,0x18] + +[0x04,0x40,0x42,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], 1.0/*Invalid immediate*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], 1.0/*Invalid immediate*/, v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a] + + + 
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x43,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], 
v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/ ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x02,0x18] + +[0x04,0x40,0x43,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], 1.0/*Invalid immediate*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], 1.0/*Invalid immediate*/, v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: 
v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid 
instruction encoding + +[0x02,0x44,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x44,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x44,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x44,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] + 
+[0x02,0x40,0x45,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x45,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x46,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x02,0x00,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c] + +[0x02,0x41,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x02,0x18] + 
+[0x02,0x40,0x46,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b] + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a] + + + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x47,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid 
instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c] + +[0x02,0x41,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x47,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b] + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a] + + + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x48,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c] + 
+[0x02,0x41,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x48,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b] + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a] + + + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c] + 
+[0x02,0xc0,0x49,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c] + +[0x02,0x41,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c] + 
+[0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x49,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b] + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a] + + + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x48,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x02,0x60,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x4a,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, s0/*Invalid register, operand has 
'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x4a,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0xc0,0x50,0xcc,0x00,0x05,0x2a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x60,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid 
instruction encoding + +[0x06,0x00,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x44,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c] # 
sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x50,0xcc,0xf2,0x04,0x2a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], 1.0/*Invalid immediate*/, v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0xe5,0x29,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], 1.0/*Invalid immediate*/, v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0xc0,0x51,0xcc,0x00,0x05,0x2a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x60,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x44,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c] # sgpr_0 src0 +# GFX12: 
v_swmmac_f32_16x16x32_bf16 v[6:9], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x51,0xcc,0xf2,0x04,0x2a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], 1.0/*Invalid immediate*/, v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0xe5,0x29,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], 1.0/*Invalid immediate*/, v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0xc0,0x52,0xcc,0x00,0x05,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 
index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x60,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:3 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c] + 
+[0x06,0x44,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x52,0xcc,0xf2,0x04,0x22,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], 1.0/*Invalid immediate*/, v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], 1.0/*Invalid immediate*/, v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0xc0,0x53,0xcc,0x00,0x05,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; 
encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x60,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:3 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c] + 
+[0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x44,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x53,0xcc,0xf2,0x04,0x22,0x1c] # 1.0 src0 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], 1.0/*Invalid immediate*/, v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], 1.0/*Invalid immediate*/, v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x06,0x18] + + + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c] 
# clamp +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid 
instruction encoding + +[0x03,0x41,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x54,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x54,0xcc,0x81,0x02,0x1e,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], 1/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x01,0x02,0x1e,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1d,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, 1/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1c,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x06,0x18] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18] + + + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c] + 
+[0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp ; encoding: [0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0x50,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0x50,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[0,1,0] +# 
GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x02,0x1a,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x02,0x1a,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x01,0x18,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x01,0x18,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x55,0xcc,0x81,0x02,0x1a,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], 1/*Invalid immediate*/, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x01,0x02,0x1a,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x19,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, 1/*Invalid immediate*/, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x18,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1/*Invalid immediate*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x06,0x18] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0/*Invalid immediate*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] + + + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: 
v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x60,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x58,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# 
GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x56,0xcc,0x81,0x02,0x1e,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], 1/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x01,0x02,0x1e,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1d,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, 1/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1c,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x06,0x18] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18] + + + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x57,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: 
[0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c] + 
+[0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x57,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x06,0x18] + + + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x58,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction 
encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, s[0:1]/*Invalid register, operand has 
'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x58,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x06,0x18] + + + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x59,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18] # 
sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x59,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x06,0x18] + + + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x03,0x00,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' 
register class*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x5a,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x06,0x18] diff --git a/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6-el.txt b/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6-el.txt index 1a73178a1f6a7e..d6f10e96d4769f 100644 --- a/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6-el.txt +++ b/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6-el.txt @@ -116,14 +116,14 @@ 0x10 0x08 0x02 0x46 # CHECK: sel.s $f0, $f1, $f2 0x35 0x10 0x64 0x00 # CHECK: seleqz $2, $3, $4 0x37 0x10 0x64 0x00 # CHECK: selnez $2, $3, $4 -0x1d 0x10 0x04 0x46 # CHECK: max.s $f0, $f2, $f4 -0x1d 0x10 0x24 0x46 # CHECK: max.d $f0, $f2, $f4 0x1c 0x10 0x04 0x46 # CHECK: min.s $f0, $f2, $f4 0x1c 0x10 0x24 0x46 # CHECK: min.d $f0, $f2, $f4 +0x1d 0x10 0x04 0x46 # CHECK: mina.s $f0, $f2, $f4 +0x1d 0x10 0x24 0x46 # CHECK: mina.d $f0, $f2, $f4 +0x1e 0x10 0x04 0x46 # CHECK: max.s $f0, $f2, $f4 +0x1e 0x10 0x24 0x46 # CHECK: max.d $f0, $f2, $f4 0x1f 0x10 0x04 0x46 # CHECK: maxa.s $f0, $f2, $f4 0x1f 0x10 0x24 0x46 # CHECK: maxa.d $f0, $f2, $f4 -0x1e 0x10 0x04 0x46 # CHECK: mina.s $f0, $f2, $f4 -0x1e 0x10 0x24 0x46 # CHECK: mina.d $f0, $f2, $f4 0x04 0x00 0x42 0x34 # CHECK: ori $2, $2, 4 0x14 0x10 0x04 0x46 # 
CHECK: seleqz.s $f0, $f2, $f4 0x14 0x10 0x24 0x46 # CHECK: seleqz.d $f0, $f2, $f4 diff --git a/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt b/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt index 53ea0258e1c4bc..e1ba009f3c4c8c 100644 --- a/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt +++ b/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt @@ -92,8 +92,8 @@ 0x46 0x04 0x10 0x14 # CHECK: seleqz.s $f0, $f2, $f4 0x46 0x04 0x10 0x17 # CHECK: selnez.s $f0, $f2, $f4 0x46 0x04 0x10 0x1c # CHECK: min.s $f0, $f2, $f4 -0x46 0x04 0x10 0x1d # CHECK: max.s $f0, $f2, $f4 -0x46 0x04 0x10 0x1e # CHECK: mina.s $f0, $f2, $f4 +0x46 0x04 0x10 0x1d # CHECK: mina.s $f0, $f2, $f4 +0x46 0x04 0x10 0x1e # CHECK: max.s $f0, $f2, $f4 0x46 0x04 0x10 0x1f # CHECK: maxa.s $f0, $f2, $f4 0x46 0x04 0x18 0x98 # CHECK: maddf.s $f2, $f3, $f4 0x46 0x04 0x18 0x99 # CHECK: msubf.s $f2, $f3, $f4 @@ -103,8 +103,8 @@ 0x46 0x24 0x10 0x14 # CHECK: seleqz.d $f0, $f2, $f4 0x46 0x24 0x10 0x17 # CHECK: selnez.d $f0, $f2, $f4 0x46 0x24 0x10 0x1c # CHECK: min.d $f0, $f2, $f4 -0x46 0x24 0x10 0x1d # CHECK: max.d $f0, $f2, $f4 -0x46 0x24 0x10 0x1e # CHECK: mina.d $f0, $f2, $f4 +0x46 0x24 0x10 0x1d # CHECK: mina.d $f0, $f2, $f4 +0x46 0x24 0x10 0x1e # CHECK: max.d $f0, $f2, $f4 0x46 0x24 0x10 0x1f # CHECK: maxa.d $f0, $f2, $f4 0x46 0x24 0x18 0x98 # CHECK: maddf.d $f2, $f3, $f4 0x46 0x24 0x18 0x99 # CHECK: msubf.d $f2, $f3, $f4 diff --git a/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6-el.txt b/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6-el.txt index 9aeea45472aebb..a7dfbd209b4e48 100644 --- a/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6-el.txt +++ b/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6-el.txt @@ -140,15 +140,15 @@ 0x43 0x00 0x50 0xec # CHECK: lwupc $2, 268 0x98 0x18 0x24 0x46 # CHECK: maddf.d $f2, $f3, $f4 0x98 0x18 0x04 0x46 # CHECK: maddf.s $f2, $f3, $f4 -0x1d 0x10 0x24 0x46 # CHECK: max.d $f0, $f2, $f4 -0x1d 0x10 0x04 0x46 # 
CHECK: max.s $f0, $f2, $f4 +0x1e 0x10 0x24 0x46 # CHECK: max.d $f0, $f2, $f4 +0x1e 0x10 0x04 0x46 # CHECK: max.s $f0, $f2, $f4 0x1f 0x10 0x24 0x46 # CHECK: maxa.d $f0, $f2, $f4 0x1f 0x10 0x04 0x46 # CHECK: maxa.s $f0, $f2, $f4 0x01 0x78 0x08 0x40 # CHECK: mfc0 $8, $15, 1 0x1c 0x10 0x24 0x46 # CHECK: min.d $f0, $f2, $f4 0x1c 0x10 0x04 0x46 # CHECK: min.s $f0, $f2, $f4 -0x1e 0x10 0x24 0x46 # CHECK: mina.d $f0, $f2, $f4 -0x1e 0x10 0x04 0x46 # CHECK: mina.s $f0, $f2, $f4 +0x1d 0x10 0x24 0x46 # CHECK: mina.d $f0, $f2, $f4 +0x1d 0x10 0x04 0x46 # CHECK: mina.s $f0, $f2, $f4 0xda 0x10 0x64 0x00 # CHECK: mod $2, $3, $4 0xdb 0x10 0x64 0x00 # CHECK: modu $2, $3, $4 0x25 0x78 0xe0 0x03 # CHECK: move $15, $ra diff --git a/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt b/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt index 32b91c6c6842e1..0030e51d6c2387 100644 --- a/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt +++ b/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt @@ -111,8 +111,8 @@ 0x46 0x04 0x10 0x14 # CHECK: seleqz.s $f0, $f2, $f4 0x46 0x04 0x10 0x17 # CHECK: selnez.s $f0, $f2, $f4 0x46 0x04 0x10 0x1c # CHECK: min.s $f0, $f2, $f4 -0x46 0x04 0x10 0x1d # CHECK: max.s $f0, $f2, $f4 -0x46 0x04 0x10 0x1e # CHECK: mina.s $f0, $f2, $f4 +0x46 0x04 0x10 0x1d # CHECK: mina.s $f0, $f2, $f4 +0x46 0x04 0x10 0x1e # CHECK: max.s $f0, $f2, $f4 0x46 0x04 0x10 0x1f # CHECK: maxa.s $f0, $f2, $f4 0x46 0x04 0x18 0x98 # CHECK: maddf.s $f2, $f3, $f4 0x46 0x04 0x18 0x99 # CHECK: msubf.s $f2, $f3, $f4 @@ -122,8 +122,8 @@ 0x46 0x24 0x10 0x14 # CHECK: seleqz.d $f0, $f2, $f4 0x46 0x24 0x10 0x17 # CHECK: selnez.d $f0, $f2, $f4 0x46 0x24 0x10 0x1c # CHECK: min.d $f0, $f2, $f4 -0x46 0x24 0x10 0x1d # CHECK: max.d $f0, $f2, $f4 -0x46 0x24 0x10 0x1e # CHECK: mina.d $f0, $f2, $f4 +0x46 0x24 0x10 0x1d # CHECK: mina.d $f0, $f2, $f4 +0x46 0x24 0x10 0x1e # CHECK: max.d $f0, $f2, $f4 0x46 0x24 0x10 0x1f # CHECK: maxa.d $f0, $f2, $f4 0x46 0x24 0x18 0x98 # CHECK: 
maddf.d $f2, $f3, $f4 0x46 0x24 0x18 0x99 # CHECK: msubf.d $f2, $f3, $f4 diff --git a/llvm/test/MC/Mips/cpsetup.s b/llvm/test/MC/Mips/cpsetup.s index 8e587aea3e7e69..4a027c6e796aea 100644 --- a/llvm/test/MC/Mips/cpsetup.s +++ b/llvm/test/MC/Mips/cpsetup.s @@ -4,8 +4,6 @@ # RUN: llvm-mc -triple mips-unknown-linux -target-abi o32 %s | \ # RUN: FileCheck -check-prefixes=ASM,ASM-O32 %s -# FIXME: Now we check .cpsetup expansion for `-mno-shared` case only. -# We also need to implement/check the `-mshared` case. # RUN: llvm-mc -triple mips64-unknown-linux -target-abi n32 -filetype=obj -o - %s | \ # RUN: llvm-objdump --no-print-imm-hex -d -r -z - | \ # RUN: FileCheck -check-prefixes=ALL,NXX,N32 %s @@ -35,11 +33,16 @@ t1: # NXX-NEXT: sd $gp, 8($sp) # NXX-NEXT: lui $gp, 0 -# N32-NEXT: R_MIPS_HI16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_HI16 # NXX-NEXT: addiu $gp, $gp, 0 -# N32-NEXT: R_MIPS_LO16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_LO16 +# N32-NEXT: addu $gp, $gp, $25 # N64-NEXT: daddu $gp, $gp, $25 # ASM-NEXT: .cpsetup $25, 8, __cerror @@ -64,11 +67,16 @@ t2: # NXX-NEXT: move $2, $gp # NXX-NEXT: lui $gp, 0 -# N32-NEXT: R_MIPS_HI16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_HI16 # NXX-NEXT: addiu $gp, $gp, 0 -# N32-NEXT: R_MIPS_LO16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_LO16 +# N32-NEXT: addu $gp, $gp, $25 # N64-NEXT: daddu $gp, $gp, $25 # ASM-NEXT: .cpsetup $25, $2, __cerror @@ -101,11 +109,16 @@ t3: # NXX-NEXT: move $2, $gp # NXX-NEXT: lui $gp, 0 -# N32-NEXT: {{^ *0+}}38: R_MIPS_HI16 __gnu_local_gp # N64-NEXT: {{^ 
*0+}}40: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 .text +# N32-NEXT: {{^ *0+}}40: R_MIPS_GPREL16 .text +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_HI16 # NXX-NEXT: addiu $gp, $gp, 0 -# N32-NEXT: {{^ *0+}}3c: R_MIPS_LO16 __gnu_local_gp # N64-NEXT: {{^ *0+}}44: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 .text +# N32-NEXT: {{^ *0+}}44: R_MIPS_GPREL16 .text +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_LO16 +# N32-NEXT: addu $gp, $gp, $25 # N64-NEXT: daddu $gp, $gp, $25 # NXX-NEXT: nop # NXX-NEXT: sub $3, $3, $2 @@ -158,11 +171,16 @@ t5: # NXX-NEXT: sd $gp, 8($sp) # NXX-NEXT: lui $gp, 0 -# N32-NEXT: R_MIPS_HI16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_HI16 # NXX-NEXT: addiu $gp, $gp, 0 -# N32-NEXT: R_MIPS_LO16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_LO16 +# N32-NEXT: addu $gp, $gp, $25 # N64-NEXT: daddu $gp, $gp, $25 # ASM-NEXT: .cpsetup $25, 8, __cerror @@ -184,11 +202,16 @@ IMM_8 = 8 # NXX-NEXT: sd $gp, 8($sp) # NXX-NEXT: lui $gp, 0 -# N32-NEXT: R_MIPS_HI16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_HI16 # NXX-NEXT: addiu $gp, $gp, 0 -# N32-NEXT: R_MIPS_LO16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_LO16 +# N32-NEXT: addu $gp, $gp, $25 # N64-NEXT: daddu $gp, $gp, $25 # ASM-NEXT: .cpsetup $25, 8, __cerror diff --git a/llvm/test/MC/Mips/forbidden-slot.s b/llvm/test/MC/Mips/forbidden-slot.s new file mode 100644 index 00000000000000..da98e70561695f --- /dev/null +++ b/llvm/test/MC/Mips/forbidden-slot.s @@ -0,0 +1,18 @@ +# RUN: llvm-mc -assemble -mcpu=mips64r6 -arch=mips64el -filetype=obj %s -o tmp.o +# RUN: llvm-objdump -d tmp.o | 
FileCheck %s --check-prefix=MIPSELR6 + +# MIPSELR6: 0000000000000000 : +# MIPSELR6-NEXT: beqzc $13, 0x0 +# MIPSELR6-NEXT: b 0x0 +# MIPSELR6: 0000000000000008 : +# MIPSELR6-NEXT: beqzc $13, 0x8 +# MIPSELR6-NEXT: nop +# MIPSELR6: b 0x8 + .set noreorder +aaa: + beqzc $t1, aaa + b aaa + .set reorder +bbb: + beqzc $t1, bbb + b bbb diff --git a/llvm/test/MC/Mips/macro-la-pic.s b/llvm/test/MC/Mips/macro-la-pic.s index 2303f34c35bcfe..1875952d80c4e7 100644 --- a/llvm/test/MC/Mips/macro-la-pic.s +++ b/llvm/test/MC/Mips/macro-la-pic.s @@ -255,3 +255,25 @@ la $25, 2f # XN32: lw $25, %got_disp(.Ltmp1)($gp) # encoding: [0x8f,0x99,A,A] # XN32: # fixup A - offset: 0, value: %got_disp(.Ltmp1), kind: fixup_Mips_GOT_DISP 2: + +la $2,.Lstr +# O32: lw $2, %got(.Lstr)($gp) # encoding: [0x8f,0x82,A,A] +# O32-NEXT: # fixup A - offset: 0, value: %got(.Lstr), kind: fixup_Mips_GOT +# O32-NEXT: addiu $2, $2, %lo(.Lstr) # encoding: [0x24,0x42,A,A] +# O32-NEXT: # fixup A - offset: 0, value: %lo(.Lstr), kind: fixup_Mips_LO16 + +# N32: lw $2, %got_disp(.Lstr)($gp) # encoding: [0x8f,0x82,A,A] +# N32-NEXT: # fixup A - offset: 0, value: %got_disp(.Lstr), kind: fixup_Mips_GOT_DISP + +la $2,$str2 +# O32: lw $2, %got($str2)($gp) # encoding: [0x8f,0x82,A,A] +# O32-NEXT: # fixup A - offset: 0, value: %got($str2), kind: fixup_Mips_GOT +# O32-NEXT: addiu $2, $2, %lo($str2) # encoding: [0x24,0x42,A,A] +# O32-NEXT: # fixup A - offset: 0, value: %lo($str2), kind: fixup_Mips_LO16 + +# N32: lw $2, %got_disp($str2)($gp) # encoding: [0x8f,0x82,A,A] +# N32-NEXT: # fixup A - offset: 0, value: %got_disp($str2), kind: fixup_Mips_GOT_DISP + +.rodata +.Lstr: .4byte 0 +$str2: .4byte 0 diff --git a/llvm/test/MC/Mips/mips32r6/relocations.s b/llvm/test/MC/Mips/mips32r6/relocations.s index dfd75e633bc2f7..8d4464bbbed77a 100644 --- a/llvm/test/MC/Mips/mips32r6/relocations.s +++ b/llvm/test/MC/Mips/mips32r6/relocations.s @@ -52,17 +52,17 @@ # CHECK-ELF: Relocations [ # CHECK-ELF: 0x0 R_MIPS_PC19_S2 bar # CHECK-ELF: 0x4 
R_MIPS_PC16 bar -# CHECK-ELF: 0x8 R_MIPS_PC16 bar -# CHECK-ELF: 0xC R_MIPS_PC21_S2 bar -# CHECK-ELF: 0x10 R_MIPS_PC21_S2 bar -# CHECK-ELF: 0x14 R_MIPS_PC26_S2 bar -# CHECK-ELF: 0x18 R_MIPS_PC26_S2 bar -# CHECK-ELF: 0x1C R_MIPS_PCHI16 bar -# CHECK-ELF: 0x20 R_MIPS_PCLO16 bar -# CHECK-ELF: 0x24 R_MIPS_PC19_S2 bar -# CHECK-ELF: 0x28 R_MIPS_PC19_S2 bar -# CHECK-ELF: 0x2C R_MIPS_LO16 bar -# CHECK-ELF: 0x30 R_MIPS_LO16 bar +# CHECK-ELF: 0xC R_MIPS_PC16 bar +# CHECK-ELF: 0x14 R_MIPS_PC21_S2 bar +# CHECK-ELF: 0x1C R_MIPS_PC21_S2 bar +# CHECK-ELF: 0x24 R_MIPS_PC26_S2 bar +# CHECK-ELF: 0x28 R_MIPS_PC26_S2 bar +# CHECK-ELF: 0x2C R_MIPS_PCHI16 bar +# CHECK-ELF: 0x30 R_MIPS_PCLO16 bar +# CHECK-ELF: 0x34 R_MIPS_PC19_S2 bar +# CHECK-ELF: 0x38 R_MIPS_PC19_S2 bar +# CHECK-ELF: 0x3C R_MIPS_LO16 bar +# CHECK-ELF: 0x40 R_MIPS_LO16 bar # CHECK-ELF: ] addiupc $2,bar diff --git a/llvm/test/MC/Mips/mips32r6/valid.s b/llvm/test/MC/Mips/mips32r6/valid.s index 0f098a176a67cc..0d705b6f242615 100644 --- a/llvm/test/MC/Mips/mips32r6/valid.s +++ b/llvm/test/MC/Mips/mips32r6/valid.s @@ -170,14 +170,14 @@ a: sel.s $f0,$f1,$f2 # CHECK: sel.s $f0, $f1, $f2 # encoding: [0x46,0x02,0x08,0x10] seleqz $2,$3,$4 # CHECK: seleqz $2, $3, $4 # encoding: [0x00,0x64,0x10,0x35] selnez $2,$3,$4 # CHECK: selnez $2, $3, $4 # encoding: [0x00,0x64,0x10,0x37] - max.s $f0, $f2, $f4 # CHECK: max.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1d] - max.d $f0, $f2, $f4 # CHECK: max.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1d] + max.s $f0, $f2, $f4 # CHECK: max.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1e] + max.d $f0, $f2, $f4 # CHECK: max.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1e] min.s $f0, $f2, $f4 # CHECK: min.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1c] min.d $f0, $f2, $f4 # CHECK: min.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1c] maxa.s $f0, $f2, $f4 # CHECK: maxa.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1f] maxa.d $f0, $f2, $f4 # CHECK: maxa.d $f0, $f2, $f4 # encoding: 
[0x46,0x24,0x10,0x1f] - mina.s $f0, $f2, $f4 # CHECK: mina.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1e] - mina.d $f0, $f2, $f4 # CHECK: mina.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1e] + mina.s $f0, $f2, $f4 # CHECK: mina.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1d] + mina.d $f0, $f2, $f4 # CHECK: mina.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1d] or $2, 4 # CHECK: ori $2, $2, 4 # encoding: [0x34,0x42,0x00,0x04] seleqz.s $f0, $f2, $f4 # CHECK: seleqz.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x14] seleqz.d $f0, $f2, $f4 # CHECK: seleqz.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x14] diff --git a/llvm/test/MC/Mips/mips64r6/relocations.s b/llvm/test/MC/Mips/mips64r6/relocations.s index 8353ec019a3ca8..8b02be37284aaa 100644 --- a/llvm/test/MC/Mips/mips64r6/relocations.s +++ b/llvm/test/MC/Mips/mips64r6/relocations.s @@ -59,19 +59,19 @@ # CHECK-ELF: Relocations [ # CHECK-ELF: 0x0 R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 # CHECK-ELF: 0x4 R_MIPS_PC16/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC -# CHECK-ELF: 0x8 R_MIPS_PC16/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC -# CHECK-ELF: 0xC R_MIPS_PC21_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC -# CHECK-ELF: 0x10 R_MIPS_PC21_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC -# CHECK-ELF: 0x14 R_MIPS_PC26_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC -# CHECK-ELF: 0x18 R_MIPS_PC26_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC -# CHECK-ELF: 0x1C R_MIPS_PCHI16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 -# CHECK-ELF: 0x20 R_MIPS_PCLO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 -# CHECK-ELF: 0x24 R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 -# CHECK-ELF: 0x28 R_MIPS_PC18_S3/R_MIPS_NONE/R_MIPS_NONE bar 0x0 -# CHECK-ELF: 0x2C R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 -# CHECK-ELF: 0x30 R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 -# CHECK-ELF: 0x34 R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 -# CHECK-ELF: 0x38 R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0xC 
R_MIPS_PC16/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC +# CHECK-ELF: 0x14 R_MIPS_PC21_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC +# CHECK-ELF: 0x1C R_MIPS_PC21_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC +# CHECK-ELF: 0x24 R_MIPS_PC26_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC +# CHECK-ELF: 0x28 R_MIPS_PC26_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC +# CHECK-ELF: 0x2C R_MIPS_PCHI16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x30 R_MIPS_PCLO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x34 R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x38 R_MIPS_PC18_S3/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x3C R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x40 R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x44 R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x48 R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 # CHECK-ELF: ] addiupc $2,bar diff --git a/llvm/test/MC/Mips/mips64r6/valid.s b/llvm/test/MC/Mips/mips64r6/valid.s index c50bd9e31c232e..ff6e1d73fbeb48 100644 --- a/llvm/test/MC/Mips/mips64r6/valid.s +++ b/llvm/test/MC/Mips/mips64r6/valid.s @@ -183,14 +183,14 @@ a: lwupc $2,268 # CHECK: lwupc $2, 268 # encoding: [0xec,0x50,0x00,0x43] maddf.d $f2,$f3,$f4 # CHECK: maddf.d $f2, $f3, $f4 # encoding: [0x46,0x24,0x18,0x98] maddf.s $f2,$f3,$f4 # CHECK: maddf.s $f2, $f3, $f4 # encoding: [0x46,0x04,0x18,0x98] - max.d $f0, $f2, $f4 # CHECK: max.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1d] - max.s $f0, $f2, $f4 # CHECK: max.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1d] + max.d $f0, $f2, $f4 # CHECK: max.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1e] + max.s $f0, $f2, $f4 # CHECK: max.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1e] maxa.d $f0, $f2, $f4 # CHECK: maxa.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1f] maxa.s $f0, $f2, $f4 # CHECK: maxa.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1f] min.d $f0, $f2, $f4 # CHECK: min.d $f0, $f2, $f4 # encoding: 
[0x46,0x24,0x10,0x1c] min.s $f0, $f2, $f4 # CHECK: min.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1c] - mina.d $f0, $f2, $f4 # CHECK: mina.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1e] - mina.s $f0, $f2, $f4 # CHECK: mina.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1e] + mina.d $f0, $f2, $f4 # CHECK: mina.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1d] + mina.s $f0, $f2, $f4 # CHECK: mina.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1d] mfc0 $8,$15,1 # CHECK: mfc0 $8, $15, 1 # encoding: [0x40,0x08,0x78,0x01] mod $2,$3,$4 # CHECK: mod $2, $3, $4 # encoding: [0x00,0x64,0x10,0xda] modu $2,$3,$4 # CHECK: modu $2, $3, $4 # encoding: [0x00,0x64,0x10,0xdb] diff --git a/llvm/test/MC/Mips/relocation.s b/llvm/test/MC/Mips/relocation.s index 9c8bb657ea68c1..a92c62744fcaa5 100644 --- a/llvm/test/MC/Mips/relocation.s +++ b/llvm/test/MC/Mips/relocation.s @@ -237,7 +237,7 @@ baz: .long foo // RELOC: R_MIPS_32 foo // ENCLE: addiu $2, $3, %tprel_lo(foo) # encoding: [A,A,0x62,0x24] // FIXUP: # fixup A - offset: 0, value: %tprel_lo(foo), kind: fixup_Mips_TPREL_LO -// DATA-NEXT: 00C0: D85FFFFF CBFFFFFF EC580000 EC480000 +// DATA-NEXT: 00C0: D85FFFFF 00000000 CBFFFFFF EC580000 // ?????: R_MIPS_GLOB_DAT foo .set mips32r6 beqzc $2, foo // RELOC: R_MIPS_PC21_S2 foo @@ -262,7 +262,7 @@ baz: .long foo // RELOC: R_MIPS_32 foo // ENCLE: lwpc $2, foo # encoding: [A,A,0b01001AAA,0xec] // FIXUP: # fixup A - offset: 0, value: foo, kind: fixup_MIPS_PC19_S2 -// DATA-NEXT: 00D0: 24620000 24620000 00000000 +// DATA-NEXT: 00D0: EC480000 24620000 24620000 00000000 addiu $2, $3, %pcrel_hi(foo) // RELOC: R_MIPS_PCHI16 foo // ENCBE: addiu $2, $3, %pcrel_hi(foo) # encoding: [0x24,0x62,A,A] // ENCLE: addiu $2, $3, %pcrel_hi(foo) # encoding: [A,A,0x62,0x24] diff --git a/llvm/test/MC/RISCV/rv32zicond-invalid.s b/llvm/test/MC/RISCV/rv32zicond-invalid.s index a350593993b525..02f5d1777b0ebc 100644 --- a/llvm/test/MC/RISCV/rv32zicond-invalid.s +++ b/llvm/test/MC/RISCV/rv32zicond-invalid.s @@ -1,5 +1,5 
@@ -# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-zicond < %s 2>&1 | FileCheck %s -# RUN: not llvm-mc -triple riscv64 -mattr=+experimental-zicond < %s 2>&1 | FileCheck %s +# RUN: not llvm-mc -triple riscv32 -mattr=+zicond < %s 2>&1 | FileCheck %s +# RUN: not llvm-mc -triple riscv64 -mattr=+zicond < %s 2>&1 | FileCheck %s # Use of operand modifier on register name czero.eqz t1, %lo(t2), t3 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/rv32zicond-valid.s b/llvm/test/MC/RISCV/rv32zicond-valid.s index e6deb81301eca7..c862f04b806788 100644 --- a/llvm/test/MC/RISCV/rv32zicond-valid.s +++ b/llvm/test/MC/RISCV/rv32zicond-valid.s @@ -1,12 +1,12 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zicond -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zicond -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zicond -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zicond -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zicond < %s \ -# RUN: | llvm-objdump --mattr=+experimental-zicond -d -r - \ +# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zicond < %s \ +# RUN: | llvm-objdump --mattr=+zicond -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zicond < %s \ -# RUN: | llvm-objdump --mattr=+experimental-zicond -d -r - \ +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zicond < %s \ +# RUN: | llvm-objdump --mattr=+zicond -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s # CHECK-ASM-AND-OBJ: czero.eqz t0, a3, ra diff --git a/llvm/test/TableGen/MacroFusion.td b/llvm/test/TableGen/MacroFusion.td index 4aa6c8d9acb273..ce76e7f0f7fa64 100644 --- a/llvm/test/TableGen/MacroFusion.td +++ 
b/llvm/test/TableGen/MacroFusion.td @@ -34,6 +34,11 @@ let Namespace = "Test" in { def Inst0 : TestInst<0>; def Inst1 : TestInst<1>; +def BothFusionPredicate: BothFusionPredicateWithMCInstPredicate>; +def TestBothFusionPredicate: Fusion<"test-both-fusion-predicate", "HasBothFusionPredicate", + "Test BothFusionPredicate", + [BothFusionPredicate]>; + def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion", CheckOpcode<[Inst0]>, CheckAll<[ @@ -45,6 +50,7 @@ def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion", // CHECK-PREDICATOR-NEXT: #undef GET_Test_MACRO_FUSION_PRED_DECL // CHECK-PREDICATOR-EMPTY: // CHECK-PREDICATOR-NEXT: namespace llvm { +// CHECK-PREDICATOR-NEXT: bool isTestBothFusionPredicate(const TargetInstrInfo &, const TargetSubtargetInfo &, const MachineInstr *, const MachineInstr &); // CHECK-PREDICATOR-NEXT: bool isTestFusion(const TargetInstrInfo &, const TargetSubtargetInfo &, const MachineInstr *, const MachineInstr &); // CHECK-PREDICATOR-NEXT: } // end namespace llvm // CHECK-PREDICATOR-EMPTY: @@ -54,6 +60,24 @@ def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion", // CHECK-PREDICATOR-NEXT: #undef GET_Test_MACRO_FUSION_PRED_IMPL // CHECK-PREDICATOR-EMPTY: // CHECK-PREDICATOR-NEXT: namespace llvm { +// CHECK-PREDICATOR-NEXT: bool isTestBothFusionPredicate( +// CHECK-PREDICATOR-NEXT: const TargetInstrInfo &TII, +// CHECK-PREDICATOR-NEXT: const TargetSubtargetInfo &STI, +// CHECK-PREDICATOR-NEXT: const MachineInstr *FirstMI, +// CHECK-PREDICATOR-NEXT: const MachineInstr &SecondMI) { +// CHECK-PREDICATOR-NEXT: auto &MRI = SecondMI.getMF()->getRegInfo(); +// CHECK-PREDICATOR-NEXT: { +// CHECK-PREDICATOR-NEXT: const MachineInstr *MI = FirstMI; +// CHECK-PREDICATOR-NEXT: if (MI->getOperand(0).getReg() != Test::X0) +// CHECK-PREDICATOR-NEXT: return false; +// CHECK-PREDICATOR-NEXT: } +// CHECK-PREDICATOR-NEXT: { +// CHECK-PREDICATOR-NEXT: const MachineInstr *MI = &SecondMI; +// 
CHECK-PREDICATOR-NEXT: if (MI->getOperand(0).getReg() != Test::X0) +// CHECK-PREDICATOR-NEXT: return false; +// CHECK-PREDICATOR-NEXT: } +// CHECK-PREDICATOR-NEXT: return true; +// CHECK-PREDICATOR-NEXT: } // CHECK-PREDICATOR-NEXT: bool isTestFusion( // CHECK-PREDICATOR-NEXT: const TargetInstrInfo &TII, // CHECK-PREDICATOR-NEXT: const TargetSubtargetInfo &STI, @@ -106,6 +130,7 @@ def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion", // CHECK-SUBTARGET: std::vector TestGenSubtargetInfo::getMacroFusions() const { // CHECK-SUBTARGET-NEXT: std::vector Fusions; +// CHECK-SUBTARGET-NEXT: if (hasFeature(Test::TestBothFusionPredicate)) Fusions.push_back(llvm::isTestBothFusionPredicate); // CHECK-SUBTARGET-NEXT: if (hasFeature(Test::TestFusion)) Fusions.push_back(llvm::isTestFusion); // CHECK-SUBTARGET-NEXT: return Fusions; // CHECK-SUBTARGET-NEXT: } diff --git a/llvm/test/TableGen/address-space-patfrags.td b/llvm/test/TableGen/address-space-patfrags.td index 4aec6ea7e0eae8..46050a70720fbe 100644 --- a/llvm/test/TableGen/address-space-patfrags.td +++ b/llvm/test/TableGen/address-space-patfrags.td @@ -46,7 +46,7 @@ def inst_d : Instruction { let InOperandList = (ins GPR32:$src0, GPR32:$src1); } -// SDAG: case 1: { +// SDAG: case 0: { // SDAG-NEXT: // Predicate_pat_frag_b // SDAG-NEXT: // Predicate_truncstorei16_addrspace // SDAG-NEXT: SDNode *N = Node; @@ -69,7 +69,7 @@ def : Pat < >; -// SDAG: case 6: { +// SDAG: case 4: { // SDAG: // Predicate_pat_frag_a // SDAG-NEXT: SDNode *N = Node; // SDAG-NEXT: (void)N; diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc index 9b12e4af00bf74..d6d96737695011 100644 --- a/llvm/test/TableGen/x86-fold-tables.inc +++ b/llvm/test/TableGen/x86-fold-tables.inc @@ -6608,15 +6608,15 @@ static const X86FoldTableEntry Table4[] = { }; static const X86FoldTableEntry BroadcastTable1[] = { - {X86::VCVTDQ2PDZ128rr, X86::VCVTDQ2PDZ128rmb, TB_BCAST_SD}, - {X86::VCVTDQ2PDZ256rr, 
X86::VCVTDQ2PDZ256rmb, TB_BCAST_SD}, - {X86::VCVTDQ2PDZrr, X86::VCVTDQ2PDZrmb, TB_BCAST_SD}, - {X86::VCVTDQ2PHZ128rr, X86::VCVTDQ2PHZ128rmb, TB_BCAST_SH}, - {X86::VCVTDQ2PHZ256rr, X86::VCVTDQ2PHZ256rmb, TB_BCAST_SH}, - {X86::VCVTDQ2PHZrr, X86::VCVTDQ2PHZrmb, TB_BCAST_SH}, - {X86::VCVTDQ2PSZ128rr, X86::VCVTDQ2PSZ128rmb, TB_BCAST_SS}, - {X86::VCVTDQ2PSZ256rr, X86::VCVTDQ2PSZ256rmb, TB_BCAST_SS}, - {X86::VCVTDQ2PSZrr, X86::VCVTDQ2PSZrmb, TB_BCAST_SS}, + {X86::VCVTDQ2PDZ128rr, X86::VCVTDQ2PDZ128rmb, TB_BCAST_D}, + {X86::VCVTDQ2PDZ256rr, X86::VCVTDQ2PDZ256rmb, TB_BCAST_D}, + {X86::VCVTDQ2PDZrr, X86::VCVTDQ2PDZrmb, TB_BCAST_D}, + {X86::VCVTDQ2PHZ128rr, X86::VCVTDQ2PHZ128rmb, TB_BCAST_D}, + {X86::VCVTDQ2PHZ256rr, X86::VCVTDQ2PHZ256rmb, TB_BCAST_D}, + {X86::VCVTDQ2PHZrr, X86::VCVTDQ2PHZrmb, TB_BCAST_D}, + {X86::VCVTDQ2PSZ128rr, X86::VCVTDQ2PSZ128rmb, TB_BCAST_D}, + {X86::VCVTDQ2PSZ256rr, X86::VCVTDQ2PSZ256rmb, TB_BCAST_D}, + {X86::VCVTDQ2PSZrr, X86::VCVTDQ2PSZrmb, TB_BCAST_D}, {X86::VCVTNEPS2BF16Z128rr, X86::VCVTNEPS2BF16Z128rmb, TB_BCAST_SS}, {X86::VCVTNEPS2BF16Z256rr, X86::VCVTNEPS2BF16Z256rmb, TB_BCAST_SS}, {X86::VCVTNEPS2BF16Zrr, X86::VCVTNEPS2BF16Zrmb, TB_BCAST_SS}, @@ -6626,9 +6626,9 @@ static const X86FoldTableEntry BroadcastTable1[] = { {X86::VCVTPD2PHZ128rr, X86::VCVTPD2PHZ128rmb, TB_BCAST_SD}, {X86::VCVTPD2PHZ256rr, X86::VCVTPD2PHZ256rmb, TB_BCAST_SD}, {X86::VCVTPD2PHZrr, X86::VCVTPD2PHZrmb, TB_BCAST_SD}, - {X86::VCVTPD2PSZ128rr, X86::VCVTPD2PSZ128rmb, TB_BCAST_SS}, - {X86::VCVTPD2PSZ256rr, X86::VCVTPD2PSZ256rmb, TB_BCAST_SS}, - {X86::VCVTPD2PSZrr, X86::VCVTPD2PSZrmb, TB_BCAST_SS}, + {X86::VCVTPD2PSZ128rr, X86::VCVTPD2PSZ128rmb, TB_BCAST_SD}, + {X86::VCVTPD2PSZ256rr, X86::VCVTPD2PSZ256rmb, TB_BCAST_SD}, + {X86::VCVTPD2PSZrr, X86::VCVTPD2PSZrmb, TB_BCAST_SD}, {X86::VCVTPD2QQZ128rr, X86::VCVTPD2QQZ128rmb, TB_BCAST_SD}, {X86::VCVTPD2QQZ256rr, X86::VCVTPD2QQZ256rmb, TB_BCAST_SD}, {X86::VCVTPD2QQZrr, X86::VCVTPD2QQZrmb, TB_BCAST_SD}, @@ -6680,15 +6680,15 @@ static 
const X86FoldTableEntry BroadcastTable1[] = { {X86::VCVTPS2UQQZ128rr, X86::VCVTPS2UQQZ128rmb, TB_BCAST_SS}, {X86::VCVTPS2UQQZ256rr, X86::VCVTPS2UQQZ256rmb, TB_BCAST_SS}, {X86::VCVTPS2UQQZrr, X86::VCVTPS2UQQZrmb, TB_BCAST_SS}, - {X86::VCVTQQ2PDZ128rr, X86::VCVTQQ2PDZ128rmb, TB_BCAST_SD}, - {X86::VCVTQQ2PDZ256rr, X86::VCVTQQ2PDZ256rmb, TB_BCAST_SD}, - {X86::VCVTQQ2PDZrr, X86::VCVTQQ2PDZrmb, TB_BCAST_SD}, - {X86::VCVTQQ2PHZ128rr, X86::VCVTQQ2PHZ128rmb, TB_BCAST_SH}, - {X86::VCVTQQ2PHZ256rr, X86::VCVTQQ2PHZ256rmb, TB_BCAST_SH}, - {X86::VCVTQQ2PHZrr, X86::VCVTQQ2PHZrmb, TB_BCAST_SH}, - {X86::VCVTQQ2PSZ128rr, X86::VCVTQQ2PSZ128rmb, TB_BCAST_SS}, - {X86::VCVTQQ2PSZ256rr, X86::VCVTQQ2PSZ256rmb, TB_BCAST_SS}, - {X86::VCVTQQ2PSZrr, X86::VCVTQQ2PSZrmb, TB_BCAST_SS}, + {X86::VCVTQQ2PDZ128rr, X86::VCVTQQ2PDZ128rmb, TB_BCAST_Q}, + {X86::VCVTQQ2PDZ256rr, X86::VCVTQQ2PDZ256rmb, TB_BCAST_Q}, + {X86::VCVTQQ2PDZrr, X86::VCVTQQ2PDZrmb, TB_BCAST_Q}, + {X86::VCVTQQ2PHZ128rr, X86::VCVTQQ2PHZ128rmb, TB_BCAST_Q}, + {X86::VCVTQQ2PHZ256rr, X86::VCVTQQ2PHZ256rmb, TB_BCAST_Q}, + {X86::VCVTQQ2PHZrr, X86::VCVTQQ2PHZrmb, TB_BCAST_Q}, + {X86::VCVTQQ2PSZ128rr, X86::VCVTQQ2PSZ128rmb, TB_BCAST_Q}, + {X86::VCVTQQ2PSZ256rr, X86::VCVTQQ2PSZ256rmb, TB_BCAST_Q}, + {X86::VCVTQQ2PSZrr, X86::VCVTQQ2PSZrmb, TB_BCAST_Q}, {X86::VCVTTPD2DQZ128rr, X86::VCVTTPD2DQZ128rmb, TB_BCAST_SD}, {X86::VCVTTPD2DQZ256rr, X86::VCVTTPD2DQZ256rmb, TB_BCAST_SD}, {X86::VCVTTPD2DQZrr, X86::VCVTTPD2DQZrmb, TB_BCAST_SD}, @@ -6731,30 +6731,30 @@ static const X86FoldTableEntry BroadcastTable1[] = { {X86::VCVTTPS2UQQZ128rr, X86::VCVTTPS2UQQZ128rmb, TB_BCAST_SS}, {X86::VCVTTPS2UQQZ256rr, X86::VCVTTPS2UQQZ256rmb, TB_BCAST_SS}, {X86::VCVTTPS2UQQZrr, X86::VCVTTPS2UQQZrmb, TB_BCAST_SS}, - {X86::VCVTUDQ2PDZ128rr, X86::VCVTUDQ2PDZ128rmb, TB_BCAST_SD}, - {X86::VCVTUDQ2PDZ256rr, X86::VCVTUDQ2PDZ256rmb, TB_BCAST_SD}, - {X86::VCVTUDQ2PDZrr, X86::VCVTUDQ2PDZrmb, TB_BCAST_SD}, - {X86::VCVTUDQ2PHZ128rr, X86::VCVTUDQ2PHZ128rmb, TB_BCAST_SH}, - 
{X86::VCVTUDQ2PHZ256rr, X86::VCVTUDQ2PHZ256rmb, TB_BCAST_SH}, - {X86::VCVTUDQ2PHZrr, X86::VCVTUDQ2PHZrmb, TB_BCAST_SH}, - {X86::VCVTUDQ2PSZ128rr, X86::VCVTUDQ2PSZ128rmb, TB_BCAST_SS}, - {X86::VCVTUDQ2PSZ256rr, X86::VCVTUDQ2PSZ256rmb, TB_BCAST_SS}, - {X86::VCVTUDQ2PSZrr, X86::VCVTUDQ2PSZrmb, TB_BCAST_SS}, - {X86::VCVTUQQ2PDZ128rr, X86::VCVTUQQ2PDZ128rmb, TB_BCAST_SD}, - {X86::VCVTUQQ2PDZ256rr, X86::VCVTUQQ2PDZ256rmb, TB_BCAST_SD}, - {X86::VCVTUQQ2PDZrr, X86::VCVTUQQ2PDZrmb, TB_BCAST_SD}, - {X86::VCVTUQQ2PHZ128rr, X86::VCVTUQQ2PHZ128rmb, TB_BCAST_SH}, - {X86::VCVTUQQ2PHZ256rr, X86::VCVTUQQ2PHZ256rmb, TB_BCAST_SH}, - {X86::VCVTUQQ2PHZrr, X86::VCVTUQQ2PHZrmb, TB_BCAST_SH}, - {X86::VCVTUQQ2PSZ128rr, X86::VCVTUQQ2PSZ128rmb, TB_BCAST_SS}, - {X86::VCVTUQQ2PSZ256rr, X86::VCVTUQQ2PSZ256rmb, TB_BCAST_SS}, - {X86::VCVTUQQ2PSZrr, X86::VCVTUQQ2PSZrmb, TB_BCAST_SS}, - {X86::VCVTUW2PHZ128rr, X86::VCVTUW2PHZ128rmb, TB_BCAST_SH}, - {X86::VCVTUW2PHZ256rr, X86::VCVTUW2PHZ256rmb, TB_BCAST_SH}, - {X86::VCVTUW2PHZrr, X86::VCVTUW2PHZrmb, TB_BCAST_SH}, - {X86::VCVTW2PHZ128rr, X86::VCVTW2PHZ128rmb, TB_BCAST_SH}, - {X86::VCVTW2PHZ256rr, X86::VCVTW2PHZ256rmb, TB_BCAST_SH}, - {X86::VCVTW2PHZrr, X86::VCVTW2PHZrmb, TB_BCAST_SH}, + {X86::VCVTUDQ2PDZ128rr, X86::VCVTUDQ2PDZ128rmb, TB_BCAST_D}, + {X86::VCVTUDQ2PDZ256rr, X86::VCVTUDQ2PDZ256rmb, TB_BCAST_D}, + {X86::VCVTUDQ2PDZrr, X86::VCVTUDQ2PDZrmb, TB_BCAST_D}, + {X86::VCVTUDQ2PHZ128rr, X86::VCVTUDQ2PHZ128rmb, TB_BCAST_D}, + {X86::VCVTUDQ2PHZ256rr, X86::VCVTUDQ2PHZ256rmb, TB_BCAST_D}, + {X86::VCVTUDQ2PHZrr, X86::VCVTUDQ2PHZrmb, TB_BCAST_D}, + {X86::VCVTUDQ2PSZ128rr, X86::VCVTUDQ2PSZ128rmb, TB_BCAST_D}, + {X86::VCVTUDQ2PSZ256rr, X86::VCVTUDQ2PSZ256rmb, TB_BCAST_D}, + {X86::VCVTUDQ2PSZrr, X86::VCVTUDQ2PSZrmb, TB_BCAST_D}, + {X86::VCVTUQQ2PDZ128rr, X86::VCVTUQQ2PDZ128rmb, TB_BCAST_Q}, + {X86::VCVTUQQ2PDZ256rr, X86::VCVTUQQ2PDZ256rmb, TB_BCAST_Q}, + {X86::VCVTUQQ2PDZrr, X86::VCVTUQQ2PDZrmb, TB_BCAST_Q}, + {X86::VCVTUQQ2PHZ128rr, X86::VCVTUQQ2PHZ128rmb, 
TB_BCAST_Q}, + {X86::VCVTUQQ2PHZ256rr, X86::VCVTUQQ2PHZ256rmb, TB_BCAST_Q}, + {X86::VCVTUQQ2PHZrr, X86::VCVTUQQ2PHZrmb, TB_BCAST_Q}, + {X86::VCVTUQQ2PSZ128rr, X86::VCVTUQQ2PSZ128rmb, TB_BCAST_Q}, + {X86::VCVTUQQ2PSZ256rr, X86::VCVTUQQ2PSZ256rmb, TB_BCAST_Q}, + {X86::VCVTUQQ2PSZrr, X86::VCVTUQQ2PSZrmb, TB_BCAST_Q}, + {X86::VCVTUW2PHZ128rr, X86::VCVTUW2PHZ128rmb, TB_BCAST_W}, + {X86::VCVTUW2PHZ256rr, X86::VCVTUW2PHZ256rmb, TB_BCAST_W}, + {X86::VCVTUW2PHZrr, X86::VCVTUW2PHZrmb, TB_BCAST_W}, + {X86::VCVTW2PHZ128rr, X86::VCVTW2PHZ128rmb, TB_BCAST_W}, + {X86::VCVTW2PHZ256rr, X86::VCVTW2PHZ256rmb, TB_BCAST_W}, + {X86::VCVTW2PHZrr, X86::VCVTW2PHZrmb, TB_BCAST_W}, {X86::VEXP2PDZr, X86::VEXP2PDZmb, TB_BCAST_SD}, {X86::VEXP2PSZr, X86::VEXP2PSZmb, TB_BCAST_SS}, {X86::VFPCLASSPDZ128rr, X86::VFPCLASSPDZ128rmb, TB_BCAST_SD}, @@ -6945,15 +6945,15 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VCMPPSZ128rri, X86::VCMPPSZ128rmbi, TB_BCAST_SS}, {X86::VCMPPSZ256rri, X86::VCMPPSZ256rmbi, TB_BCAST_SS}, {X86::VCMPPSZrri, X86::VCMPPSZrmbi, TB_BCAST_SS}, - {X86::VCVTDQ2PDZ128rrkz, X86::VCVTDQ2PDZ128rmbkz, TB_BCAST_SD}, - {X86::VCVTDQ2PDZ256rrkz, X86::VCVTDQ2PDZ256rmbkz, TB_BCAST_SD}, - {X86::VCVTDQ2PDZrrkz, X86::VCVTDQ2PDZrmbkz, TB_BCAST_SD}, - {X86::VCVTDQ2PHZ128rrkz, X86::VCVTDQ2PHZ128rmbkz, TB_BCAST_SH}, - {X86::VCVTDQ2PHZ256rrkz, X86::VCVTDQ2PHZ256rmbkz, TB_BCAST_SH}, - {X86::VCVTDQ2PHZrrkz, X86::VCVTDQ2PHZrmbkz, TB_BCAST_SH}, - {X86::VCVTDQ2PSZ128rrkz, X86::VCVTDQ2PSZ128rmbkz, TB_BCAST_SS}, - {X86::VCVTDQ2PSZ256rrkz, X86::VCVTDQ2PSZ256rmbkz, TB_BCAST_SS}, - {X86::VCVTDQ2PSZrrkz, X86::VCVTDQ2PSZrmbkz, TB_BCAST_SS}, + {X86::VCVTDQ2PDZ128rrkz, X86::VCVTDQ2PDZ128rmbkz, TB_BCAST_D}, + {X86::VCVTDQ2PDZ256rrkz, X86::VCVTDQ2PDZ256rmbkz, TB_BCAST_D}, + {X86::VCVTDQ2PDZrrkz, X86::VCVTDQ2PDZrmbkz, TB_BCAST_D}, + {X86::VCVTDQ2PHZ128rrkz, X86::VCVTDQ2PHZ128rmbkz, TB_BCAST_D}, + {X86::VCVTDQ2PHZ256rrkz, X86::VCVTDQ2PHZ256rmbkz, TB_BCAST_D}, + {X86::VCVTDQ2PHZrrkz, 
X86::VCVTDQ2PHZrmbkz, TB_BCAST_D}, + {X86::VCVTDQ2PSZ128rrkz, X86::VCVTDQ2PSZ128rmbkz, TB_BCAST_D}, + {X86::VCVTDQ2PSZ256rrkz, X86::VCVTDQ2PSZ256rmbkz, TB_BCAST_D}, + {X86::VCVTDQ2PSZrrkz, X86::VCVTDQ2PSZrmbkz, TB_BCAST_D}, {X86::VCVTNE2PS2BF16Z128rr, X86::VCVTNE2PS2BF16Z128rmb, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Z256rr, X86::VCVTNE2PS2BF16Z256rmb, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Zrr, X86::VCVTNE2PS2BF16Zrmb, TB_BCAST_SS}, @@ -6966,9 +6966,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VCVTPD2PHZ128rrkz, X86::VCVTPD2PHZ128rmbkz, TB_BCAST_SD}, {X86::VCVTPD2PHZ256rrkz, X86::VCVTPD2PHZ256rmbkz, TB_BCAST_SD}, {X86::VCVTPD2PHZrrkz, X86::VCVTPD2PHZrmbkz, TB_BCAST_SD}, - {X86::VCVTPD2PSZ128rrkz, X86::VCVTPD2PSZ128rmbkz, TB_BCAST_SS}, - {X86::VCVTPD2PSZ256rrkz, X86::VCVTPD2PSZ256rmbkz, TB_BCAST_SS}, - {X86::VCVTPD2PSZrrkz, X86::VCVTPD2PSZrmbkz, TB_BCAST_SS}, + {X86::VCVTPD2PSZ128rrkz, X86::VCVTPD2PSZ128rmbkz, TB_BCAST_SD}, + {X86::VCVTPD2PSZ256rrkz, X86::VCVTPD2PSZ256rmbkz, TB_BCAST_SD}, + {X86::VCVTPD2PSZrrkz, X86::VCVTPD2PSZrmbkz, TB_BCAST_SD}, {X86::VCVTPD2QQZ128rrkz, X86::VCVTPD2QQZ128rmbkz, TB_BCAST_SD}, {X86::VCVTPD2QQZ256rrkz, X86::VCVTPD2QQZ256rmbkz, TB_BCAST_SD}, {X86::VCVTPD2QQZrrkz, X86::VCVTPD2QQZrmbkz, TB_BCAST_SD}, @@ -7020,15 +7020,15 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VCVTPS2UQQZ128rrkz, X86::VCVTPS2UQQZ128rmbkz, TB_BCAST_SS}, {X86::VCVTPS2UQQZ256rrkz, X86::VCVTPS2UQQZ256rmbkz, TB_BCAST_SS}, {X86::VCVTPS2UQQZrrkz, X86::VCVTPS2UQQZrmbkz, TB_BCAST_SS}, - {X86::VCVTQQ2PDZ128rrkz, X86::VCVTQQ2PDZ128rmbkz, TB_BCAST_SD}, - {X86::VCVTQQ2PDZ256rrkz, X86::VCVTQQ2PDZ256rmbkz, TB_BCAST_SD}, - {X86::VCVTQQ2PDZrrkz, X86::VCVTQQ2PDZrmbkz, TB_BCAST_SD}, - {X86::VCVTQQ2PHZ128rrkz, X86::VCVTQQ2PHZ128rmbkz, TB_BCAST_SH}, - {X86::VCVTQQ2PHZ256rrkz, X86::VCVTQQ2PHZ256rmbkz, TB_BCAST_SH}, - {X86::VCVTQQ2PHZrrkz, X86::VCVTQQ2PHZrmbkz, TB_BCAST_SH}, - {X86::VCVTQQ2PSZ128rrkz, X86::VCVTQQ2PSZ128rmbkz, TB_BCAST_SS}, - 
{X86::VCVTQQ2PSZ256rrkz, X86::VCVTQQ2PSZ256rmbkz, TB_BCAST_SS}, - {X86::VCVTQQ2PSZrrkz, X86::VCVTQQ2PSZrmbkz, TB_BCAST_SS}, + {X86::VCVTQQ2PDZ128rrkz, X86::VCVTQQ2PDZ128rmbkz, TB_BCAST_Q}, + {X86::VCVTQQ2PDZ256rrkz, X86::VCVTQQ2PDZ256rmbkz, TB_BCAST_Q}, + {X86::VCVTQQ2PDZrrkz, X86::VCVTQQ2PDZrmbkz, TB_BCAST_Q}, + {X86::VCVTQQ2PHZ128rrkz, X86::VCVTQQ2PHZ128rmbkz, TB_BCAST_Q}, + {X86::VCVTQQ2PHZ256rrkz, X86::VCVTQQ2PHZ256rmbkz, TB_BCAST_Q}, + {X86::VCVTQQ2PHZrrkz, X86::VCVTQQ2PHZrmbkz, TB_BCAST_Q}, + {X86::VCVTQQ2PSZ128rrkz, X86::VCVTQQ2PSZ128rmbkz, TB_BCAST_Q}, + {X86::VCVTQQ2PSZ256rrkz, X86::VCVTQQ2PSZ256rmbkz, TB_BCAST_Q}, + {X86::VCVTQQ2PSZrrkz, X86::VCVTQQ2PSZrmbkz, TB_BCAST_Q}, {X86::VCVTTPD2DQZ128rrkz, X86::VCVTTPD2DQZ128rmbkz, TB_BCAST_SD}, {X86::VCVTTPD2DQZ256rrkz, X86::VCVTTPD2DQZ256rmbkz, TB_BCAST_SD}, {X86::VCVTTPD2DQZrrkz, X86::VCVTTPD2DQZrmbkz, TB_BCAST_SD}, @@ -7071,30 +7071,30 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VCVTTPS2UQQZ128rrkz, X86::VCVTTPS2UQQZ128rmbkz, TB_BCAST_SS}, {X86::VCVTTPS2UQQZ256rrkz, X86::VCVTTPS2UQQZ256rmbkz, TB_BCAST_SS}, {X86::VCVTTPS2UQQZrrkz, X86::VCVTTPS2UQQZrmbkz, TB_BCAST_SS}, - {X86::VCVTUDQ2PDZ128rrkz, X86::VCVTUDQ2PDZ128rmbkz, TB_BCAST_SD}, - {X86::VCVTUDQ2PDZ256rrkz, X86::VCVTUDQ2PDZ256rmbkz, TB_BCAST_SD}, - {X86::VCVTUDQ2PDZrrkz, X86::VCVTUDQ2PDZrmbkz, TB_BCAST_SD}, - {X86::VCVTUDQ2PHZ128rrkz, X86::VCVTUDQ2PHZ128rmbkz, TB_BCAST_SH}, - {X86::VCVTUDQ2PHZ256rrkz, X86::VCVTUDQ2PHZ256rmbkz, TB_BCAST_SH}, - {X86::VCVTUDQ2PHZrrkz, X86::VCVTUDQ2PHZrmbkz, TB_BCAST_SH}, - {X86::VCVTUDQ2PSZ128rrkz, X86::VCVTUDQ2PSZ128rmbkz, TB_BCAST_SS}, - {X86::VCVTUDQ2PSZ256rrkz, X86::VCVTUDQ2PSZ256rmbkz, TB_BCAST_SS}, - {X86::VCVTUDQ2PSZrrkz, X86::VCVTUDQ2PSZrmbkz, TB_BCAST_SS}, - {X86::VCVTUQQ2PDZ128rrkz, X86::VCVTUQQ2PDZ128rmbkz, TB_BCAST_SD}, - {X86::VCVTUQQ2PDZ256rrkz, X86::VCVTUQQ2PDZ256rmbkz, TB_BCAST_SD}, - {X86::VCVTUQQ2PDZrrkz, X86::VCVTUQQ2PDZrmbkz, TB_BCAST_SD}, - {X86::VCVTUQQ2PHZ128rrkz, 
X86::VCVTUQQ2PHZ128rmbkz, TB_BCAST_SH}, - {X86::VCVTUQQ2PHZ256rrkz, X86::VCVTUQQ2PHZ256rmbkz, TB_BCAST_SH}, - {X86::VCVTUQQ2PHZrrkz, X86::VCVTUQQ2PHZrmbkz, TB_BCAST_SH}, - {X86::VCVTUQQ2PSZ128rrkz, X86::VCVTUQQ2PSZ128rmbkz, TB_BCAST_SS}, - {X86::VCVTUQQ2PSZ256rrkz, X86::VCVTUQQ2PSZ256rmbkz, TB_BCAST_SS}, - {X86::VCVTUQQ2PSZrrkz, X86::VCVTUQQ2PSZrmbkz, TB_BCAST_SS}, - {X86::VCVTUW2PHZ128rrkz, X86::VCVTUW2PHZ128rmbkz, TB_BCAST_SH}, - {X86::VCVTUW2PHZ256rrkz, X86::VCVTUW2PHZ256rmbkz, TB_BCAST_SH}, - {X86::VCVTUW2PHZrrkz, X86::VCVTUW2PHZrmbkz, TB_BCAST_SH}, - {X86::VCVTW2PHZ128rrkz, X86::VCVTW2PHZ128rmbkz, TB_BCAST_SH}, - {X86::VCVTW2PHZ256rrkz, X86::VCVTW2PHZ256rmbkz, TB_BCAST_SH}, - {X86::VCVTW2PHZrrkz, X86::VCVTW2PHZrmbkz, TB_BCAST_SH}, + {X86::VCVTUDQ2PDZ128rrkz, X86::VCVTUDQ2PDZ128rmbkz, TB_BCAST_D}, + {X86::VCVTUDQ2PDZ256rrkz, X86::VCVTUDQ2PDZ256rmbkz, TB_BCAST_D}, + {X86::VCVTUDQ2PDZrrkz, X86::VCVTUDQ2PDZrmbkz, TB_BCAST_D}, + {X86::VCVTUDQ2PHZ128rrkz, X86::VCVTUDQ2PHZ128rmbkz, TB_BCAST_D}, + {X86::VCVTUDQ2PHZ256rrkz, X86::VCVTUDQ2PHZ256rmbkz, TB_BCAST_D}, + {X86::VCVTUDQ2PHZrrkz, X86::VCVTUDQ2PHZrmbkz, TB_BCAST_D}, + {X86::VCVTUDQ2PSZ128rrkz, X86::VCVTUDQ2PSZ128rmbkz, TB_BCAST_D}, + {X86::VCVTUDQ2PSZ256rrkz, X86::VCVTUDQ2PSZ256rmbkz, TB_BCAST_D}, + {X86::VCVTUDQ2PSZrrkz, X86::VCVTUDQ2PSZrmbkz, TB_BCAST_D}, + {X86::VCVTUQQ2PDZ128rrkz, X86::VCVTUQQ2PDZ128rmbkz, TB_BCAST_Q}, + {X86::VCVTUQQ2PDZ256rrkz, X86::VCVTUQQ2PDZ256rmbkz, TB_BCAST_Q}, + {X86::VCVTUQQ2PDZrrkz, X86::VCVTUQQ2PDZrmbkz, TB_BCAST_Q}, + {X86::VCVTUQQ2PHZ128rrkz, X86::VCVTUQQ2PHZ128rmbkz, TB_BCAST_Q}, + {X86::VCVTUQQ2PHZ256rrkz, X86::VCVTUQQ2PHZ256rmbkz, TB_BCAST_Q}, + {X86::VCVTUQQ2PHZrrkz, X86::VCVTUQQ2PHZrmbkz, TB_BCAST_Q}, + {X86::VCVTUQQ2PSZ128rrkz, X86::VCVTUQQ2PSZ128rmbkz, TB_BCAST_Q}, + {X86::VCVTUQQ2PSZ256rrkz, X86::VCVTUQQ2PSZ256rmbkz, TB_BCAST_Q}, + {X86::VCVTUQQ2PSZrrkz, X86::VCVTUQQ2PSZrmbkz, TB_BCAST_Q}, + {X86::VCVTUW2PHZ128rrkz, X86::VCVTUW2PHZ128rmbkz, TB_BCAST_W}, + 
{X86::VCVTUW2PHZ256rrkz, X86::VCVTUW2PHZ256rmbkz, TB_BCAST_W}, + {X86::VCVTUW2PHZrrkz, X86::VCVTUW2PHZrmbkz, TB_BCAST_W}, + {X86::VCVTW2PHZ128rrkz, X86::VCVTW2PHZ128rmbkz, TB_BCAST_W}, + {X86::VCVTW2PHZ256rrkz, X86::VCVTW2PHZ256rmbkz, TB_BCAST_W}, + {X86::VCVTW2PHZrrkz, X86::VCVTW2PHZrmbkz, TB_BCAST_W}, {X86::VDIVPDZ128rr, X86::VDIVPDZ128rmb, TB_BCAST_SD}, {X86::VDIVPDZ256rr, X86::VDIVPDZ256rmb, TB_BCAST_SD}, {X86::VDIVPDZrr, X86::VDIVPDZrmb, TB_BCAST_SD}, @@ -7148,9 +7148,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rmb, TB_BCAST_SD}, {X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rmb, TB_BCAST_SD}, {X86::VMAXCPDZrr, X86::VMAXCPDZrmb, TB_BCAST_SD}, - {X86::VMAXCPHZ128rr, X86::VMAXCPHZ128rmb, TB_BCAST_SS}, - {X86::VMAXCPHZ256rr, X86::VMAXCPHZ256rmb, TB_BCAST_SS}, - {X86::VMAXCPHZrr, X86::VMAXCPHZrmb, TB_BCAST_SS}, + {X86::VMAXCPHZ128rr, X86::VMAXCPHZ128rmb, TB_BCAST_SH}, + {X86::VMAXCPHZ256rr, X86::VMAXCPHZ256rmb, TB_BCAST_SH}, + {X86::VMAXCPHZrr, X86::VMAXCPHZrmb, TB_BCAST_SH}, {X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rmb, TB_BCAST_SS}, {X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rmb, TB_BCAST_SS}, {X86::VMAXCPSZrr, X86::VMAXCPSZrmb, TB_BCAST_SS}, @@ -7166,9 +7166,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VMINCPDZ128rr, X86::VMINCPDZ128rmb, TB_BCAST_SD}, {X86::VMINCPDZ256rr, X86::VMINCPDZ256rmb, TB_BCAST_SD}, {X86::VMINCPDZrr, X86::VMINCPDZrmb, TB_BCAST_SD}, - {X86::VMINCPHZ128rr, X86::VMINCPHZ128rmb, TB_BCAST_SS}, - {X86::VMINCPHZ256rr, X86::VMINCPHZ256rmb, TB_BCAST_SS}, - {X86::VMINCPHZrr, X86::VMINCPHZrmb, TB_BCAST_SS}, + {X86::VMINCPHZ128rr, X86::VMINCPHZ128rmb, TB_BCAST_SH}, + {X86::VMINCPHZ256rr, X86::VMINCPHZ256rmb, TB_BCAST_SH}, + {X86::VMINCPHZrr, X86::VMINCPHZrmb, TB_BCAST_SH}, {X86::VMINCPSZ128rr, X86::VMINCPSZ128rmb, TB_BCAST_SS}, {X86::VMINCPSZ256rr, X86::VMINCPSZ256rmb, TB_BCAST_SS}, {X86::VMINCPSZrr, X86::VMINCPSZrmb, TB_BCAST_SS}, @@ -7442,15 +7442,15 @@ static const X86FoldTableEntry 
BroadcastTable2[] = { {X86::VPTESTNMQZ128rr, X86::VPTESTNMQZ128rmb, TB_BCAST_Q}, {X86::VPTESTNMQZ256rr, X86::VPTESTNMQZ256rmb, TB_BCAST_Q}, {X86::VPTESTNMQZrr, X86::VPTESTNMQZrmb, TB_BCAST_Q}, - {X86::VPUNPCKHDQZ128rr, X86::VPUNPCKHDQZ128rmb, TB_BCAST_Q}, - {X86::VPUNPCKHDQZ256rr, X86::VPUNPCKHDQZ256rmb, TB_BCAST_Q}, - {X86::VPUNPCKHDQZrr, X86::VPUNPCKHDQZrmb, TB_BCAST_Q}, + {X86::VPUNPCKHDQZ128rr, X86::VPUNPCKHDQZ128rmb, TB_BCAST_D}, + {X86::VPUNPCKHDQZ256rr, X86::VPUNPCKHDQZ256rmb, TB_BCAST_D}, + {X86::VPUNPCKHDQZrr, X86::VPUNPCKHDQZrmb, TB_BCAST_D}, {X86::VPUNPCKHQDQZ128rr, X86::VPUNPCKHQDQZ128rmb, TB_BCAST_Q}, {X86::VPUNPCKHQDQZ256rr, X86::VPUNPCKHQDQZ256rmb, TB_BCAST_Q}, {X86::VPUNPCKHQDQZrr, X86::VPUNPCKHQDQZrmb, TB_BCAST_Q}, - {X86::VPUNPCKLDQZ128rr, X86::VPUNPCKLDQZ128rmb, TB_BCAST_Q}, - {X86::VPUNPCKLDQZ256rr, X86::VPUNPCKLDQZ256rmb, TB_BCAST_Q}, - {X86::VPUNPCKLDQZrr, X86::VPUNPCKLDQZrmb, TB_BCAST_Q}, + {X86::VPUNPCKLDQZ128rr, X86::VPUNPCKLDQZ128rmb, TB_BCAST_D}, + {X86::VPUNPCKLDQZ256rr, X86::VPUNPCKLDQZ256rmb, TB_BCAST_D}, + {X86::VPUNPCKLDQZrr, X86::VPUNPCKLDQZrmb, TB_BCAST_D}, {X86::VPUNPCKLQDQZ128rr, X86::VPUNPCKLQDQZ128rmb, TB_BCAST_Q}, {X86::VPUNPCKLQDQZ256rr, X86::VPUNPCKLQDQZ256rmb, TB_BCAST_Q}, {X86::VPUNPCKLQDQZrr, X86::VPUNPCKLQDQZrmb, TB_BCAST_Q}, @@ -7610,15 +7610,15 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VCMPPSZ128rrik, X86::VCMPPSZ128rmbik, TB_BCAST_SS}, {X86::VCMPPSZ256rrik, X86::VCMPPSZ256rmbik, TB_BCAST_SS}, {X86::VCMPPSZrrik, X86::VCMPPSZrmbik, TB_BCAST_SS}, - {X86::VCVTDQ2PDZ128rrk, X86::VCVTDQ2PDZ128rmbk, TB_BCAST_SD}, - {X86::VCVTDQ2PDZ256rrk, X86::VCVTDQ2PDZ256rmbk, TB_BCAST_SD}, - {X86::VCVTDQ2PDZrrk, X86::VCVTDQ2PDZrmbk, TB_BCAST_SD}, - {X86::VCVTDQ2PHZ128rrk, X86::VCVTDQ2PHZ128rmbk, TB_BCAST_SH}, - {X86::VCVTDQ2PHZ256rrk, X86::VCVTDQ2PHZ256rmbk, TB_BCAST_SH}, - {X86::VCVTDQ2PHZrrk, X86::VCVTDQ2PHZrmbk, TB_BCAST_SH}, - {X86::VCVTDQ2PSZ128rrk, X86::VCVTDQ2PSZ128rmbk, TB_BCAST_SS}, - {X86::VCVTDQ2PSZ256rrk, 
X86::VCVTDQ2PSZ256rmbk, TB_BCAST_SS}, - {X86::VCVTDQ2PSZrrk, X86::VCVTDQ2PSZrmbk, TB_BCAST_SS}, + {X86::VCVTDQ2PDZ128rrk, X86::VCVTDQ2PDZ128rmbk, TB_BCAST_D}, + {X86::VCVTDQ2PDZ256rrk, X86::VCVTDQ2PDZ256rmbk, TB_BCAST_D}, + {X86::VCVTDQ2PDZrrk, X86::VCVTDQ2PDZrmbk, TB_BCAST_D}, + {X86::VCVTDQ2PHZ128rrk, X86::VCVTDQ2PHZ128rmbk, TB_BCAST_D}, + {X86::VCVTDQ2PHZ256rrk, X86::VCVTDQ2PHZ256rmbk, TB_BCAST_D}, + {X86::VCVTDQ2PHZrrk, X86::VCVTDQ2PHZrmbk, TB_BCAST_D}, + {X86::VCVTDQ2PSZ128rrk, X86::VCVTDQ2PSZ128rmbk, TB_BCAST_D}, + {X86::VCVTDQ2PSZ256rrk, X86::VCVTDQ2PSZ256rmbk, TB_BCAST_D}, + {X86::VCVTDQ2PSZrrk, X86::VCVTDQ2PSZrmbk, TB_BCAST_D}, {X86::VCVTNE2PS2BF16Z128rrkz, X86::VCVTNE2PS2BF16Z128rmbkz, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Z256rrkz, X86::VCVTNE2PS2BF16Z256rmbkz, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Zrrkz, X86::VCVTNE2PS2BF16Zrmbkz, TB_BCAST_SS}, @@ -7631,9 +7631,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VCVTPD2PHZ128rrk, X86::VCVTPD2PHZ128rmbk, TB_BCAST_SD}, {X86::VCVTPD2PHZ256rrk, X86::VCVTPD2PHZ256rmbk, TB_BCAST_SD}, {X86::VCVTPD2PHZrrk, X86::VCVTPD2PHZrmbk, TB_BCAST_SD}, - {X86::VCVTPD2PSZ128rrk, X86::VCVTPD2PSZ128rmbk, TB_BCAST_SS}, - {X86::VCVTPD2PSZ256rrk, X86::VCVTPD2PSZ256rmbk, TB_BCAST_SS}, - {X86::VCVTPD2PSZrrk, X86::VCVTPD2PSZrmbk, TB_BCAST_SS}, + {X86::VCVTPD2PSZ128rrk, X86::VCVTPD2PSZ128rmbk, TB_BCAST_SD}, + {X86::VCVTPD2PSZ256rrk, X86::VCVTPD2PSZ256rmbk, TB_BCAST_SD}, + {X86::VCVTPD2PSZrrk, X86::VCVTPD2PSZrmbk, TB_BCAST_SD}, {X86::VCVTPD2QQZ128rrk, X86::VCVTPD2QQZ128rmbk, TB_BCAST_SD}, {X86::VCVTPD2QQZ256rrk, X86::VCVTPD2QQZ256rmbk, TB_BCAST_SD}, {X86::VCVTPD2QQZrrk, X86::VCVTPD2QQZrmbk, TB_BCAST_SD}, @@ -7685,15 +7685,15 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VCVTPS2UQQZ128rrk, X86::VCVTPS2UQQZ128rmbk, TB_BCAST_SS}, {X86::VCVTPS2UQQZ256rrk, X86::VCVTPS2UQQZ256rmbk, TB_BCAST_SS}, {X86::VCVTPS2UQQZrrk, X86::VCVTPS2UQQZrmbk, TB_BCAST_SS}, - {X86::VCVTQQ2PDZ128rrk, X86::VCVTQQ2PDZ128rmbk, TB_BCAST_SD}, - 
{X86::VCVTQQ2PDZ256rrk, X86::VCVTQQ2PDZ256rmbk, TB_BCAST_SD}, - {X86::VCVTQQ2PDZrrk, X86::VCVTQQ2PDZrmbk, TB_BCAST_SD}, - {X86::VCVTQQ2PHZ128rrk, X86::VCVTQQ2PHZ128rmbk, TB_BCAST_SH}, - {X86::VCVTQQ2PHZ256rrk, X86::VCVTQQ2PHZ256rmbk, TB_BCAST_SH}, - {X86::VCVTQQ2PHZrrk, X86::VCVTQQ2PHZrmbk, TB_BCAST_SH}, - {X86::VCVTQQ2PSZ128rrk, X86::VCVTQQ2PSZ128rmbk, TB_BCAST_SS}, - {X86::VCVTQQ2PSZ256rrk, X86::VCVTQQ2PSZ256rmbk, TB_BCAST_SS}, - {X86::VCVTQQ2PSZrrk, X86::VCVTQQ2PSZrmbk, TB_BCAST_SS}, + {X86::VCVTQQ2PDZ128rrk, X86::VCVTQQ2PDZ128rmbk, TB_BCAST_Q}, + {X86::VCVTQQ2PDZ256rrk, X86::VCVTQQ2PDZ256rmbk, TB_BCAST_Q}, + {X86::VCVTQQ2PDZrrk, X86::VCVTQQ2PDZrmbk, TB_BCAST_Q}, + {X86::VCVTQQ2PHZ128rrk, X86::VCVTQQ2PHZ128rmbk, TB_BCAST_Q}, + {X86::VCVTQQ2PHZ256rrk, X86::VCVTQQ2PHZ256rmbk, TB_BCAST_Q}, + {X86::VCVTQQ2PHZrrk, X86::VCVTQQ2PHZrmbk, TB_BCAST_Q}, + {X86::VCVTQQ2PSZ128rrk, X86::VCVTQQ2PSZ128rmbk, TB_BCAST_Q}, + {X86::VCVTQQ2PSZ256rrk, X86::VCVTQQ2PSZ256rmbk, TB_BCAST_Q}, + {X86::VCVTQQ2PSZrrk, X86::VCVTQQ2PSZrmbk, TB_BCAST_Q}, {X86::VCVTTPD2DQZ128rrk, X86::VCVTTPD2DQZ128rmbk, TB_BCAST_SD}, {X86::VCVTTPD2DQZ256rrk, X86::VCVTTPD2DQZ256rmbk, TB_BCAST_SD}, {X86::VCVTTPD2DQZrrk, X86::VCVTTPD2DQZrmbk, TB_BCAST_SD}, @@ -7736,30 +7736,30 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VCVTTPS2UQQZ128rrk, X86::VCVTTPS2UQQZ128rmbk, TB_BCAST_SS}, {X86::VCVTTPS2UQQZ256rrk, X86::VCVTTPS2UQQZ256rmbk, TB_BCAST_SS}, {X86::VCVTTPS2UQQZrrk, X86::VCVTTPS2UQQZrmbk, TB_BCAST_SS}, - {X86::VCVTUDQ2PDZ128rrk, X86::VCVTUDQ2PDZ128rmbk, TB_BCAST_SD}, - {X86::VCVTUDQ2PDZ256rrk, X86::VCVTUDQ2PDZ256rmbk, TB_BCAST_SD}, - {X86::VCVTUDQ2PDZrrk, X86::VCVTUDQ2PDZrmbk, TB_BCAST_SD}, - {X86::VCVTUDQ2PHZ128rrk, X86::VCVTUDQ2PHZ128rmbk, TB_BCAST_SH}, - {X86::VCVTUDQ2PHZ256rrk, X86::VCVTUDQ2PHZ256rmbk, TB_BCAST_SH}, - {X86::VCVTUDQ2PHZrrk, X86::VCVTUDQ2PHZrmbk, TB_BCAST_SH}, - {X86::VCVTUDQ2PSZ128rrk, X86::VCVTUDQ2PSZ128rmbk, TB_BCAST_SS}, - {X86::VCVTUDQ2PSZ256rrk, X86::VCVTUDQ2PSZ256rmbk, 
TB_BCAST_SS}, - {X86::VCVTUDQ2PSZrrk, X86::VCVTUDQ2PSZrmbk, TB_BCAST_SS}, - {X86::VCVTUQQ2PDZ128rrk, X86::VCVTUQQ2PDZ128rmbk, TB_BCAST_SD}, - {X86::VCVTUQQ2PDZ256rrk, X86::VCVTUQQ2PDZ256rmbk, TB_BCAST_SD}, - {X86::VCVTUQQ2PDZrrk, X86::VCVTUQQ2PDZrmbk, TB_BCAST_SD}, - {X86::VCVTUQQ2PHZ128rrk, X86::VCVTUQQ2PHZ128rmbk, TB_BCAST_SH}, - {X86::VCVTUQQ2PHZ256rrk, X86::VCVTUQQ2PHZ256rmbk, TB_BCAST_SH}, - {X86::VCVTUQQ2PHZrrk, X86::VCVTUQQ2PHZrmbk, TB_BCAST_SH}, - {X86::VCVTUQQ2PSZ128rrk, X86::VCVTUQQ2PSZ128rmbk, TB_BCAST_SS}, - {X86::VCVTUQQ2PSZ256rrk, X86::VCVTUQQ2PSZ256rmbk, TB_BCAST_SS}, - {X86::VCVTUQQ2PSZrrk, X86::VCVTUQQ2PSZrmbk, TB_BCAST_SS}, - {X86::VCVTUW2PHZ128rrk, X86::VCVTUW2PHZ128rmbk, TB_BCAST_SH}, - {X86::VCVTUW2PHZ256rrk, X86::VCVTUW2PHZ256rmbk, TB_BCAST_SH}, - {X86::VCVTUW2PHZrrk, X86::VCVTUW2PHZrmbk, TB_BCAST_SH}, - {X86::VCVTW2PHZ128rrk, X86::VCVTW2PHZ128rmbk, TB_BCAST_SH}, - {X86::VCVTW2PHZ256rrk, X86::VCVTW2PHZ256rmbk, TB_BCAST_SH}, - {X86::VCVTW2PHZrrk, X86::VCVTW2PHZrmbk, TB_BCAST_SH}, + {X86::VCVTUDQ2PDZ128rrk, X86::VCVTUDQ2PDZ128rmbk, TB_BCAST_D}, + {X86::VCVTUDQ2PDZ256rrk, X86::VCVTUDQ2PDZ256rmbk, TB_BCAST_D}, + {X86::VCVTUDQ2PDZrrk, X86::VCVTUDQ2PDZrmbk, TB_BCAST_D}, + {X86::VCVTUDQ2PHZ128rrk, X86::VCVTUDQ2PHZ128rmbk, TB_BCAST_D}, + {X86::VCVTUDQ2PHZ256rrk, X86::VCVTUDQ2PHZ256rmbk, TB_BCAST_D}, + {X86::VCVTUDQ2PHZrrk, X86::VCVTUDQ2PHZrmbk, TB_BCAST_D}, + {X86::VCVTUDQ2PSZ128rrk, X86::VCVTUDQ2PSZ128rmbk, TB_BCAST_D}, + {X86::VCVTUDQ2PSZ256rrk, X86::VCVTUDQ2PSZ256rmbk, TB_BCAST_D}, + {X86::VCVTUDQ2PSZrrk, X86::VCVTUDQ2PSZrmbk, TB_BCAST_D}, + {X86::VCVTUQQ2PDZ128rrk, X86::VCVTUQQ2PDZ128rmbk, TB_BCAST_Q}, + {X86::VCVTUQQ2PDZ256rrk, X86::VCVTUQQ2PDZ256rmbk, TB_BCAST_Q}, + {X86::VCVTUQQ2PDZrrk, X86::VCVTUQQ2PDZrmbk, TB_BCAST_Q}, + {X86::VCVTUQQ2PHZ128rrk, X86::VCVTUQQ2PHZ128rmbk, TB_BCAST_Q}, + {X86::VCVTUQQ2PHZ256rrk, X86::VCVTUQQ2PHZ256rmbk, TB_BCAST_Q}, + {X86::VCVTUQQ2PHZrrk, X86::VCVTUQQ2PHZrmbk, TB_BCAST_Q}, + {X86::VCVTUQQ2PSZ128rrk, 
X86::VCVTUQQ2PSZ128rmbk, TB_BCAST_Q}, + {X86::VCVTUQQ2PSZ256rrk, X86::VCVTUQQ2PSZ256rmbk, TB_BCAST_Q}, + {X86::VCVTUQQ2PSZrrk, X86::VCVTUQQ2PSZrmbk, TB_BCAST_Q}, + {X86::VCVTUW2PHZ128rrk, X86::VCVTUW2PHZ128rmbk, TB_BCAST_W}, + {X86::VCVTUW2PHZ256rrk, X86::VCVTUW2PHZ256rmbk, TB_BCAST_W}, + {X86::VCVTUW2PHZrrk, X86::VCVTUW2PHZrmbk, TB_BCAST_W}, + {X86::VCVTW2PHZ128rrk, X86::VCVTW2PHZ128rmbk, TB_BCAST_W}, + {X86::VCVTW2PHZ256rrk, X86::VCVTW2PHZ256rmbk, TB_BCAST_W}, + {X86::VCVTW2PHZrrk, X86::VCVTW2PHZrmbk, TB_BCAST_W}, {X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmbkz, TB_BCAST_SD}, {X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmbkz, TB_BCAST_SD}, {X86::VDIVPDZrrkz, X86::VDIVPDZrmbkz, TB_BCAST_SD}, @@ -7981,9 +7981,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmbkz, TB_BCAST_SD}, {X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmbkz, TB_BCAST_SD}, {X86::VMAXCPDZrrkz, X86::VMAXCPDZrmbkz, TB_BCAST_SD}, - {X86::VMAXCPHZ128rrkz, X86::VMAXCPHZ128rmbkz, TB_BCAST_SS}, - {X86::VMAXCPHZ256rrkz, X86::VMAXCPHZ256rmbkz, TB_BCAST_SS}, - {X86::VMAXCPHZrrkz, X86::VMAXCPHZrmbkz, TB_BCAST_SS}, + {X86::VMAXCPHZ128rrkz, X86::VMAXCPHZ128rmbkz, TB_BCAST_SH}, + {X86::VMAXCPHZ256rrkz, X86::VMAXCPHZ256rmbkz, TB_BCAST_SH}, + {X86::VMAXCPHZrrkz, X86::VMAXCPHZrmbkz, TB_BCAST_SH}, {X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmbkz, TB_BCAST_SS}, {X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmbkz, TB_BCAST_SS}, {X86::VMAXCPSZrrkz, X86::VMAXCPSZrmbkz, TB_BCAST_SS}, @@ -7999,9 +7999,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmbkz, TB_BCAST_SD}, {X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmbkz, TB_BCAST_SD}, {X86::VMINCPDZrrkz, X86::VMINCPDZrmbkz, TB_BCAST_SD}, - {X86::VMINCPHZ128rrkz, X86::VMINCPHZ128rmbkz, TB_BCAST_SS}, - {X86::VMINCPHZ256rrkz, X86::VMINCPHZ256rmbkz, TB_BCAST_SS}, - {X86::VMINCPHZrrkz, X86::VMINCPHZrmbkz, TB_BCAST_SS}, + {X86::VMINCPHZ128rrkz, X86::VMINCPHZ128rmbkz, TB_BCAST_SH}, + {X86::VMINCPHZ256rrkz, 
X86::VMINCPHZ256rmbkz, TB_BCAST_SH}, + {X86::VMINCPHZrrkz, X86::VMINCPHZrmbkz, TB_BCAST_SH}, {X86::VMINCPSZ128rrkz, X86::VMINCPSZ128rmbkz, TB_BCAST_SS}, {X86::VMINCPSZ256rrkz, X86::VMINCPSZ256rmbkz, TB_BCAST_SS}, {X86::VMINCPSZrrkz, X86::VMINCPSZrmbkz, TB_BCAST_SS}, @@ -8095,15 +8095,15 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmbk, TB_BCAST_Q}, {X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmbk, TB_BCAST_Q}, {X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmbk, TB_BCAST_Q}, - {X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128mb, TB_BCAST_SD}, - {X86::VPDPBUSDSZ256r, X86::VPDPBUSDSZ256mb, TB_BCAST_SD}, - {X86::VPDPBUSDSZr, X86::VPDPBUSDSZmb, TB_BCAST_SD}, + {X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128mb, TB_BCAST_D}, + {X86::VPDPBUSDSZ256r, X86::VPDPBUSDSZ256mb, TB_BCAST_D}, + {X86::VPDPBUSDSZr, X86::VPDPBUSDSZmb, TB_BCAST_D}, {X86::VPDPBUSDZ128r, X86::VPDPBUSDZ128mb, TB_BCAST_D}, {X86::VPDPBUSDZ256r, X86::VPDPBUSDZ256mb, TB_BCAST_D}, {X86::VPDPBUSDZr, X86::VPDPBUSDZmb, TB_BCAST_D}, - {X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128mb, TB_BCAST_SD}, - {X86::VPDPWSSDSZ256r, X86::VPDPWSSDSZ256mb, TB_BCAST_SD}, - {X86::VPDPWSSDSZr, X86::VPDPWSSDSZmb, TB_BCAST_SD}, + {X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128mb, TB_BCAST_D}, + {X86::VPDPWSSDSZ256r, X86::VPDPWSSDSZ256mb, TB_BCAST_D}, + {X86::VPDPWSSDSZr, X86::VPDPWSSDSZmb, TB_BCAST_D}, {X86::VPDPWSSDZ128r, X86::VPDPWSSDZ128mb, TB_BCAST_D}, {X86::VPDPWSSDZ256r, X86::VPDPWSSDZ256mb, TB_BCAST_D}, {X86::VPDPWSSDZr, X86::VPDPWSSDZmb, TB_BCAST_D}, @@ -8329,15 +8329,15 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VPTESTNMQZ128rrk, X86::VPTESTNMQZ128rmbk, TB_BCAST_Q}, {X86::VPTESTNMQZ256rrk, X86::VPTESTNMQZ256rmbk, TB_BCAST_Q}, {X86::VPTESTNMQZrrk, X86::VPTESTNMQZrmbk, TB_BCAST_Q}, - {X86::VPUNPCKHDQZ128rrkz, X86::VPUNPCKHDQZ128rmbkz, TB_BCAST_Q}, - {X86::VPUNPCKHDQZ256rrkz, X86::VPUNPCKHDQZ256rmbkz, TB_BCAST_Q}, - {X86::VPUNPCKHDQZrrkz, X86::VPUNPCKHDQZrmbkz, TB_BCAST_Q}, + 
{X86::VPUNPCKHDQZ128rrkz, X86::VPUNPCKHDQZ128rmbkz, TB_BCAST_D}, + {X86::VPUNPCKHDQZ256rrkz, X86::VPUNPCKHDQZ256rmbkz, TB_BCAST_D}, + {X86::VPUNPCKHDQZrrkz, X86::VPUNPCKHDQZrmbkz, TB_BCAST_D}, {X86::VPUNPCKHQDQZ128rrkz, X86::VPUNPCKHQDQZ128rmbkz, TB_BCAST_Q}, {X86::VPUNPCKHQDQZ256rrkz, X86::VPUNPCKHQDQZ256rmbkz, TB_BCAST_Q}, {X86::VPUNPCKHQDQZrrkz, X86::VPUNPCKHQDQZrmbkz, TB_BCAST_Q}, - {X86::VPUNPCKLDQZ128rrkz, X86::VPUNPCKLDQZ128rmbkz, TB_BCAST_Q}, - {X86::VPUNPCKLDQZ256rrkz, X86::VPUNPCKLDQZ256rmbkz, TB_BCAST_Q}, - {X86::VPUNPCKLDQZrrkz, X86::VPUNPCKLDQZrmbkz, TB_BCAST_Q}, + {X86::VPUNPCKLDQZ128rrkz, X86::VPUNPCKLDQZ128rmbkz, TB_BCAST_D}, + {X86::VPUNPCKLDQZ256rrkz, X86::VPUNPCKLDQZ256rmbkz, TB_BCAST_D}, + {X86::VPUNPCKLDQZrrkz, X86::VPUNPCKLDQZrmbkz, TB_BCAST_D}, {X86::VPUNPCKLQDQZ128rrkz, X86::VPUNPCKLQDQZ128rmbkz, TB_BCAST_Q}, {X86::VPUNPCKLQDQZ256rrkz, X86::VPUNPCKLQDQZ256rmbkz, TB_BCAST_Q}, {X86::VPUNPCKLQDQZrrkz, X86::VPUNPCKLQDQZrmbkz, TB_BCAST_Q}, @@ -8863,9 +8863,9 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmbk, TB_BCAST_SD}, {X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmbk, TB_BCAST_SD}, {X86::VMAXCPDZrrk, X86::VMAXCPDZrmbk, TB_BCAST_SD}, - {X86::VMAXCPHZ128rrk, X86::VMAXCPHZ128rmbk, TB_BCAST_SS}, - {X86::VMAXCPHZ256rrk, X86::VMAXCPHZ256rmbk, TB_BCAST_SS}, - {X86::VMAXCPHZrrk, X86::VMAXCPHZrmbk, TB_BCAST_SS}, + {X86::VMAXCPHZ128rrk, X86::VMAXCPHZ128rmbk, TB_BCAST_SH}, + {X86::VMAXCPHZ256rrk, X86::VMAXCPHZ256rmbk, TB_BCAST_SH}, + {X86::VMAXCPHZrrk, X86::VMAXCPHZrmbk, TB_BCAST_SH}, {X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmbk, TB_BCAST_SS}, {X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmbk, TB_BCAST_SS}, {X86::VMAXCPSZrrk, X86::VMAXCPSZrmbk, TB_BCAST_SS}, @@ -8881,9 +8881,9 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmbk, TB_BCAST_SD}, {X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmbk, TB_BCAST_SD}, {X86::VMINCPDZrrk, X86::VMINCPDZrmbk, TB_BCAST_SD}, - {X86::VMINCPHZ128rrk, 
X86::VMINCPHZ128rmbk, TB_BCAST_SS}, - {X86::VMINCPHZ256rrk, X86::VMINCPHZ256rmbk, TB_BCAST_SS}, - {X86::VMINCPHZrrk, X86::VMINCPHZrmbk, TB_BCAST_SS}, + {X86::VMINCPHZ128rrk, X86::VMINCPHZ128rmbk, TB_BCAST_SH}, + {X86::VMINCPHZ256rrk, X86::VMINCPHZ256rmbk, TB_BCAST_SH}, + {X86::VMINCPHZrrk, X86::VMINCPHZrmbk, TB_BCAST_SH}, {X86::VMINCPSZ128rrk, X86::VMINCPSZ128rmbk, TB_BCAST_SS}, {X86::VMINCPSZ256rrk, X86::VMINCPSZ256rmbk, TB_BCAST_SS}, {X86::VMINCPSZrrk, X86::VMINCPSZrmbk, TB_BCAST_SS}, @@ -8935,24 +8935,24 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VPANDQZ128rrk, X86::VPANDQZ128rmbk, TB_BCAST_Q}, {X86::VPANDQZ256rrk, X86::VPANDQZ256rmbk, TB_BCAST_Q}, {X86::VPANDQZrrk, X86::VPANDQZrmbk, TB_BCAST_Q}, - {X86::VPDPBUSDSZ128rk, X86::VPDPBUSDSZ128mbk, TB_BCAST_SD}, - {X86::VPDPBUSDSZ128rkz, X86::VPDPBUSDSZ128mbkz, TB_BCAST_SD}, - {X86::VPDPBUSDSZ256rk, X86::VPDPBUSDSZ256mbk, TB_BCAST_SD}, - {X86::VPDPBUSDSZ256rkz, X86::VPDPBUSDSZ256mbkz, TB_BCAST_SD}, - {X86::VPDPBUSDSZrk, X86::VPDPBUSDSZmbk, TB_BCAST_SD}, - {X86::VPDPBUSDSZrkz, X86::VPDPBUSDSZmbkz, TB_BCAST_SD}, + {X86::VPDPBUSDSZ128rk, X86::VPDPBUSDSZ128mbk, TB_BCAST_D}, + {X86::VPDPBUSDSZ128rkz, X86::VPDPBUSDSZ128mbkz, TB_BCAST_D}, + {X86::VPDPBUSDSZ256rk, X86::VPDPBUSDSZ256mbk, TB_BCAST_D}, + {X86::VPDPBUSDSZ256rkz, X86::VPDPBUSDSZ256mbkz, TB_BCAST_D}, + {X86::VPDPBUSDSZrk, X86::VPDPBUSDSZmbk, TB_BCAST_D}, + {X86::VPDPBUSDSZrkz, X86::VPDPBUSDSZmbkz, TB_BCAST_D}, {X86::VPDPBUSDZ128rk, X86::VPDPBUSDZ128mbk, TB_BCAST_D}, {X86::VPDPBUSDZ128rkz, X86::VPDPBUSDZ128mbkz, TB_BCAST_D}, {X86::VPDPBUSDZ256rk, X86::VPDPBUSDZ256mbk, TB_BCAST_D}, {X86::VPDPBUSDZ256rkz, X86::VPDPBUSDZ256mbkz, TB_BCAST_D}, {X86::VPDPBUSDZrk, X86::VPDPBUSDZmbk, TB_BCAST_D}, {X86::VPDPBUSDZrkz, X86::VPDPBUSDZmbkz, TB_BCAST_D}, - {X86::VPDPWSSDSZ128rk, X86::VPDPWSSDSZ128mbk, TB_BCAST_SD}, - {X86::VPDPWSSDSZ128rkz, X86::VPDPWSSDSZ128mbkz, TB_BCAST_SD}, - {X86::VPDPWSSDSZ256rk, X86::VPDPWSSDSZ256mbk, TB_BCAST_SD}, - 
{X86::VPDPWSSDSZ256rkz, X86::VPDPWSSDSZ256mbkz, TB_BCAST_SD}, - {X86::VPDPWSSDSZrk, X86::VPDPWSSDSZmbk, TB_BCAST_SD}, - {X86::VPDPWSSDSZrkz, X86::VPDPWSSDSZmbkz, TB_BCAST_SD}, + {X86::VPDPWSSDSZ128rk, X86::VPDPWSSDSZ128mbk, TB_BCAST_D}, + {X86::VPDPWSSDSZ128rkz, X86::VPDPWSSDSZ128mbkz, TB_BCAST_D}, + {X86::VPDPWSSDSZ256rk, X86::VPDPWSSDSZ256mbk, TB_BCAST_D}, + {X86::VPDPWSSDSZ256rkz, X86::VPDPWSSDSZ256mbkz, TB_BCAST_D}, + {X86::VPDPWSSDSZrk, X86::VPDPWSSDSZmbk, TB_BCAST_D}, + {X86::VPDPWSSDSZrkz, X86::VPDPWSSDSZmbkz, TB_BCAST_D}, {X86::VPDPWSSDZ128rk, X86::VPDPWSSDZ128mbk, TB_BCAST_D}, {X86::VPDPWSSDZ128rkz, X86::VPDPWSSDZ128mbkz, TB_BCAST_D}, {X86::VPDPWSSDZ256rk, X86::VPDPWSSDZ256mbk, TB_BCAST_D}, @@ -9162,15 +9162,15 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VPTERNLOGQZ256rrikz, X86::VPTERNLOGQZ256rmbikz, TB_BCAST_Q}, {X86::VPTERNLOGQZrrik, X86::VPTERNLOGQZrmbik, TB_BCAST_Q}, {X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmbikz, TB_BCAST_Q}, - {X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmbk, TB_BCAST_Q}, - {X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmbk, TB_BCAST_Q}, - {X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmbk, TB_BCAST_Q}, + {X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmbk, TB_BCAST_D}, + {X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmbk, TB_BCAST_D}, + {X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmbk, TB_BCAST_D}, {X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmbk, TB_BCAST_Q}, {X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmbk, TB_BCAST_Q}, {X86::VPUNPCKHQDQZrrk, X86::VPUNPCKHQDQZrmbk, TB_BCAST_Q}, - {X86::VPUNPCKLDQZ128rrk, X86::VPUNPCKLDQZ128rmbk, TB_BCAST_Q}, - {X86::VPUNPCKLDQZ256rrk, X86::VPUNPCKLDQZ256rmbk, TB_BCAST_Q}, - {X86::VPUNPCKLDQZrrk, X86::VPUNPCKLDQZrmbk, TB_BCAST_Q}, + {X86::VPUNPCKLDQZ128rrk, X86::VPUNPCKLDQZ128rmbk, TB_BCAST_D}, + {X86::VPUNPCKLDQZ256rrk, X86::VPUNPCKLDQZ256rmbk, TB_BCAST_D}, + {X86::VPUNPCKLDQZrrk, X86::VPUNPCKLDQZrmbk, TB_BCAST_D}, {X86::VPUNPCKLQDQZ128rrk, X86::VPUNPCKLQDQZ128rmbk, TB_BCAST_Q}, 
{X86::VPUNPCKLQDQZ256rrk, X86::VPUNPCKLQDQZ256rmbk, TB_BCAST_Q}, {X86::VPUNPCKLQDQZrrk, X86::VPUNPCKLQDQZrmbk, TB_BCAST_Q}, diff --git a/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll b/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll new file mode 100644 index 00000000000000..1e1669b29b0db6 --- /dev/null +++ b/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll @@ -0,0 +1,100 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -p argpromotion -S %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" + +@f = dso_local global { i16, i64 } { i16 1, i64 0 }, align 8 + +; Test case for https://github.com/llvm/llvm-project/issues/84807. + +; Make sure the loads from @callee are not moved to @caller, as the store +; in %then may aliases to load from %q. + +define i32 @caller1(i1 %c) { +; CHECK-LABEL: define i32 @caller1( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @callee1(ptr noundef nonnull @f, i1 [[C]]) +; CHECK-NEXT: ret i32 0 +; +entry: + call void @callee1(ptr noundef nonnull @f, i1 %c) + ret i32 0 +} + +define internal void @callee1(ptr nocapture noundef readonly %q, i1 %c) { +; CHECK-LABEL: define internal void @callee1( +; CHECK-SAME: ptr nocapture noundef readonly [[Q:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[EXIT:%.*]] +; CHECK: then: +; CHECK-NEXT: store i16 123, ptr @f, align 8 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[Q_0_VAL:%.*]] = load i16, ptr [[Q]], align 8 +; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 8 +; CHECK-NEXT: [[Q_8_VAL:%.*]] = load i64, ptr [[GEP_8]], align 8 +; CHECK-NEXT: call void @use(i16 [[Q_0_VAL]], i64 [[Q_8_VAL]]) +; CHECK-NEXT: ret void +; +entry: + br i1 
%c, label %then, label %exit + +then: + store i16 123, ptr @f, align 8 + br label %exit + +exit: + %l.0 = load i16, ptr %q, align 8 + %gep.8 = getelementptr inbounds i8, ptr %q, i64 8 + %l.1 = load i64, ptr %gep.8, align 8 + call void @use(i16 %l.0, i64 %l.1) + ret void + + uselistorder ptr %q, { 1, 0 } +} + +; Same as @caller1/callee2, but with default uselist order. +define i32 @caller2(i1 %c) { +; CHECK-LABEL: define i32 @caller2( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @callee2(ptr noundef nonnull @f, i1 [[C]]) +; CHECK-NEXT: ret i32 0 +; +entry: + call void @callee2(ptr noundef nonnull @f, i1 %c) + ret i32 0 +} + +define internal void @callee2(ptr nocapture noundef readonly %q, i1 %c) { +; CHECK-LABEL: define internal void @callee2( +; CHECK-SAME: ptr nocapture noundef readonly [[Q:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[EXIT:%.*]] +; CHECK: then: +; CHECK-NEXT: store i16 123, ptr @f, align 8 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[Q_0_VAL:%.*]] = load i16, ptr [[Q]], align 8 +; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 8 +; CHECK-NEXT: [[Q_8_VAL:%.*]] = load i64, ptr [[GEP_8]], align 8 +; CHECK-NEXT: call void @use(i16 [[Q_0_VAL]], i64 [[Q_8_VAL]]) +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %then, label %exit + +then: + store i16 123, ptr @f, align 8 + br label %exit + +exit: + %l.0 = load i16, ptr %q, align 8 + %gep.8 = getelementptr inbounds i8, ptr %q, i64 8 + %l.1 = load i64, ptr %gep.8, align 8 + call void @use(i16 %l.0, i64 %l.1) + ret void +} + +declare void @use(i16, i64) diff --git a/llvm/test/Transforms/Attributor/nofpclass-fpext.ll b/llvm/test/Transforms/Attributor/nofpclass-fpext.ll index 0ba114117ceec6..ee36f949529d4f 100644 --- a/llvm/test/Transforms/Attributor/nofpclass-fpext.ll +++ b/llvm/test/Transforms/Attributor/nofpclass-fpext.ll @@ -142,7 +142,7 @@ define double 
@ret_fpext_f32_to_f64_nosub(float nofpclass(sub) %arg0) { } define double @ret_fpext_f32_to_f64_nonorm(float nofpclass(norm) %arg0) { -; CHECK-LABEL: define nofpclass(sub norm) double @ret_fpext_f32_to_f64_nonorm +; CHECK-LABEL: define nofpclass(sub) double @ret_fpext_f32_to_f64_nonorm ; CHECK-SAME: (float nofpclass(sub norm) [[ARG0:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[EXT:%.*]] = fpext float [[ARG0]] to double ; CHECK-NEXT: ret double [[EXT]] @@ -482,7 +482,37 @@ define double @ret_fpext_bf16_f64_nosub(bfloat nofpclass(sub) %arg0) { } define double @ret_fpext_bf16_f64_nonorm(bfloat nofpclass(norm) %arg0) { -; CHECK-LABEL: define nofpclass(sub norm) double @ret_fpext_bf16_f64_nonorm +; CHECK-LABEL: define nofpclass(sub) double @ret_fpext_bf16_f64_nonorm +; CHECK-SAME: (bfloat nofpclass(sub norm) [[ARG0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[EXT:%.*]] = fpext bfloat [[ARG0]] to double +; CHECK-NEXT: ret double [[EXT]] +; + %ext = fpext bfloat %arg0 to double + ret double %ext +} + +define double @ret_fpext_bf16_f64_nonorm_psub(bfloat nofpclass(norm psub) %arg0) { +; CHECK-LABEL: define nofpclass(sub pnorm) double @ret_fpext_bf16_f64_nonorm_psub +; CHECK-SAME: (bfloat nofpclass(sub norm) [[ARG0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[EXT:%.*]] = fpext bfloat [[ARG0]] to double +; CHECK-NEXT: ret double [[EXT]] +; + %ext = fpext bfloat %arg0 to double + ret double %ext +} + +define double @ret_fpext_bf16_f64_nonorm_nsub(bfloat nofpclass(norm nsub) %arg0) { +; CHECK-LABEL: define nofpclass(sub nnorm) double @ret_fpext_bf16_f64_nonorm_nsub +; CHECK-SAME: (bfloat nofpclass(sub norm) [[ARG0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[EXT:%.*]] = fpext bfloat [[ARG0]] to double +; CHECK-NEXT: ret double [[EXT]] +; + %ext = fpext bfloat %arg0 to double + ret double %ext +} + +define double @ret_fpext_bf16_f64_nonorm_sub(bfloat nofpclass(norm sub) %arg0) { +; CHECK-LABEL: define nofpclass(sub norm) double @ret_fpext_bf16_f64_nonorm_sub ; CHECK-SAME: (bfloat nofpclass(sub norm) 
[[ARG0:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[EXT:%.*]] = fpext bfloat [[ARG0]] to double ; CHECK-NEXT: ret double [[EXT]] diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll index ff5cef7e781fe6..25dfb3c53a077b 100644 --- a/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll +++ b/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll @@ -211,6 +211,29 @@ else: ret i32 %l } +define i32 @sub10_else_drop_nuw(i32 %a) { +; CHECK-LABEL: @sub10_else_drop_nuw( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L:%.*]] = sub i32 [[A:%.*]], 10 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[L]], 0 +; CHECK-NEXT: br i1 [[TMP0]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: ret i32 0 +; CHECK: else: +; CHECK-NEXT: ret i32 [[L]] +; +entry: + %c = icmp eq i32 %a, 10 + br i1 %c, label %then, label %else + +then: + ret i32 0 + +else: + %l = sub nuw i32 %a, 10 + ret i32 %l +} + define i32 @subm10_then(i32 %a) { ; CHECK-LABEL: @subm10_then( ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/CodeGenPrepare/RISCV/convert-to-eqz.ll b/llvm/test/Transforms/CodeGenPrepare/RISCV/convert-to-eqz.ll new file mode 100644 index 00000000000000..a6909d14913494 --- /dev/null +++ b/llvm/test/Transforms/CodeGenPrepare/RISCV/convert-to-eqz.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -codegenprepare -S -mtriple=riscv64 < %s | FileCheck %s + +define i8 @hoist_add(i8 %x) { +; CHECK-LABEL: define i8 @hoist_add( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INC:%.*]] = add i8 [[X]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i8 [[INC]], 0 +; CHECK-NEXT: br i1 [[TMP0]], label [[EXIT:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i8 [ [[INC]], [[IF_THEN]] ], [ -1, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i8 [[RETVAL]] +; 
+entry: + %cmp = icmp eq i8 %x, -1 + br i1 %cmp, label %exit, label %if.then + +if.then: + %inc = add nuw nsw i8 %x, 1 + br label %exit + +exit: + %retval = phi i8 [ %inc, %if.then ], [ -1, %entry ] + ret i8 %retval +} + +define i8 @hoist_lshr(i8 %x) { +; CHECK-LABEL: define i8 @hoist_lshr( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INC:%.*]] = lshr i8 [[X]], 3 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i8 [[INC]], 0 +; CHECK-NEXT: br i1 [[TMP0]], label [[EXIT:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i8 [ [[INC]], [[IF_THEN]] ], [ -1, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i8 [[RETVAL]] +; +entry: + %cmp = icmp ult i8 %x, 8 + br i1 %cmp, label %exit, label %if.then + +if.then: + %inc = lshr exact i8 %x, 3 + br label %exit + +exit: + %retval = phi i8 [ %inc, %if.then ], [ -1, %entry ] + ret i8 %retval +} + +define i8 @nomove_add(i8 %x) { +; CHECK-LABEL: define i8 @nomove_add( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INC:%.*]] = add i8 [[X]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i8 [[INC]], 0 +; CHECK-NEXT: br i1 [[TMP0]], label [[EXIT:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i8 [ [[INC]], [[IF_THEN]] ], [ -1, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i8 [[RETVAL]] +; +entry: + %inc = add nuw nsw i8 %x, 1 + %cmp = icmp eq i8 %x, -1 + br i1 %cmp, label %exit, label %if.then + +if.then: + br label %exit + +exit: + %retval = phi i8 [ %inc, %if.then ], [ -1, %entry ] + ret i8 %retval +} diff --git a/llvm/test/Transforms/ConstraintElimination/minmax.ll b/llvm/test/Transforms/ConstraintElimination/minmax.ll index a31cf6845ad67d..82b932f14c4ffa 100644 --- a/llvm/test/Transforms/ConstraintElimination/minmax.ll +++ b/llvm/test/Transforms/ConstraintElimination/minmax.ll @@ -306,7 +306,9 @@ define i1 @smin_branchless(i32 %x, i32 %y) { ; CHECK-SAME: (i32 
[[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.smin.i32(i32 [[X]], i32 [[Y]]) -; CHECK-NEXT: [[RET:%.*]] = xor i1 true, false +; CHECK-NEXT: [[CMP1:%.*]] = icmp sle i32 [[MIN]], [[X]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[MIN]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = xor i1 [[CMP1]], [[CMP2]] ; CHECK-NEXT: ret i1 [[RET]] ; entry: diff --git a/llvm/test/Transforms/ConstraintElimination/umin-result-may-be-poison.ll b/llvm/test/Transforms/ConstraintElimination/umin-result-may-be-poison.ll new file mode 100644 index 00000000000000..6d1d95ec4fdba7 --- /dev/null +++ b/llvm/test/Transforms/ConstraintElimination/umin-result-may-be-poison.ll @@ -0,0 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -p constraint-elimination -S %s | FileCheck %s + +; Tests for https://github.com/llvm/llvm-project/issues/78621. + +define i1 @umin_not_used(i32 %arg) { +; CHECK-LABEL: define i1 @umin_not_used( +; CHECK-SAME: i32 [[ARG:%.*]]) { +; CHECK-NEXT: [[ICMP:%.*]] = icmp slt i32 [[ARG]], 0 +; CHECK-NEXT: [[SHL:%.*]] = shl nuw nsw i32 [[ARG]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.umin.i32(i32 [[SHL]], i32 80) +; CHECK-NEXT: [[CMP2:%.*]] = shl nuw nsw i32 [[ARG]], 3 +; CHECK-NEXT: ret i1 [[ICMP]] +; + %icmp = icmp slt i32 %arg, 0 + %shl = shl nuw nsw i32 %arg, 3 + call i32 @llvm.umin.i32(i32 %shl, i32 80) + %cmp2 = shl nuw nsw i32 %arg, 3 + ret i1 %icmp +} + +define i1 @umin_poison_is_UB_via_call(i32 %arg) { +; CHECK-LABEL: define i1 @umin_poison_is_UB_via_call( +; CHECK-SAME: i32 [[ARG:%.*]]) { +; CHECK-NEXT: [[SHL:%.*]] = shl nuw nsw i32 [[ARG]], 3 +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[SHL]], i32 80) +; CHECK-NEXT: call void @noundef(i32 noundef [[MIN]]) +; CHECK-NEXT: [[CMP2:%.*]] = shl nuw nsw i32 [[ARG]], 3 +; CHECK-NEXT: ret i1 false +; + %icmp = icmp slt i32 %arg, 0 + %shl = shl nuw nsw i32 %arg, 3 + %min = call i32 @llvm.umin.i32(i32 
%shl, i32 80) + call void @noundef(i32 noundef %min) + %cmp2 = shl nuw nsw i32 %arg, 3 + ret i1 %icmp +} + +define i1 @umin_poison_call_before_UB(i32 %arg) { +; CHECK-LABEL: define i1 @umin_poison_call_before_UB( +; CHECK-SAME: i32 [[ARG:%.*]]) { +; CHECK-NEXT: [[ICMP:%.*]] = icmp slt i32 [[ARG]], 0 +; CHECK-NEXT: [[SHL:%.*]] = shl nuw nsw i32 [[ARG]], 3 +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[SHL]], i32 80) +; CHECK-NEXT: call void @fn() +; CHECK-NEXT: call void @noundef(i32 noundef [[MIN]]) +; CHECK-NEXT: [[CMP2:%.*]] = shl nuw nsw i32 [[ARG]], 3 +; CHECK-NEXT: ret i1 [[ICMP]] +; + %icmp = icmp slt i32 %arg, 0 + %shl = shl nuw nsw i32 %arg, 3 + %min = call i32 @llvm.umin.i32(i32 %shl, i32 80) + call void @fn() + call void @noundef(i32 noundef %min) + %cmp2 = shl nuw nsw i32 %arg, 3 + ret i1 %icmp +} + +declare i32 @llvm.umin.i32(i32, i32) #0 + +declare void @noundef(i32 noundef) +declare void @fn() diff --git a/llvm/test/Transforms/DeadStoreElimination/batchaa-caching-new-pointers.ll b/llvm/test/Transforms/DeadStoreElimination/batchaa-caching-new-pointers.ll new file mode 100644 index 00000000000000..ee9bd6912e2ae4 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/batchaa-caching-new-pointers.ll @@ -0,0 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=dse < %s | FileCheck %s +; +; DSE kills `store i32 44, ptr %struct.byte.4, align 4` but should not kill +; `call void @llvm.memset.p0.i64(...)` because it has a clobber read: +; `%ret = load ptr, ptr %struct.byte.8` + + +%struct.type = type { ptr, ptr } + +define ptr @foo(ptr noundef %ptr) { +; CHECK-LABEL: define ptr @foo( +; CHECK-SAME: ptr noundef [[PTR:%.*]]) { +; CHECK-NEXT: [[STRUCT_ALLOCA:%.*]] = alloca [[STRUCT_TYPE:%.*]], align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6:[0-9]+]] +; CHECK-NEXT: [[STRUCT_BYTE_8:%.*]] = getelementptr inbounds 
i8, ptr [[STRUCT_ALLOCA]], i64 8 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_BYTE_8]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 [[TMP1]], i8 42, i64 4, i1 false) +; CHECK-NEXT: store i32 43, ptr [[STRUCT_BYTE_8]], align 4 +; CHECK-NEXT: [[RET:%.*]] = load ptr, ptr [[STRUCT_BYTE_8]], align 8 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6]] +; CHECK-NEXT: ret ptr [[RET]] +; + %struct.alloca = alloca %struct.type, align 8 + call void @llvm.lifetime.start.p0(i64 56, ptr nonnull %struct.alloca) nounwind + %struct.byte.8 = getelementptr inbounds i8, ptr %struct.alloca, i64 8 + ; Set %struct.alloca[8, 16) to 42. + call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 %struct.byte.8, i8 42, i64 8, i1 false) + ; Set %struct.alloca[8, 12) to 43. + store i32 43, ptr %struct.byte.8, align 4 + ; Set %struct.alloca[4, 8) to 44. + %struct.byte.4 = getelementptr inbounds i8, ptr %struct.alloca, i64 4 + store i32 44, ptr %struct.byte.4, align 4 + ; Return %struct.alloca[8, 16). + %ret = load ptr, ptr %struct.byte.8 + call void @llvm.lifetime.end.p0(i64 56, ptr nonnull %struct.alloca) nounwind + ret ptr %ret +} + +; Set of tests based on @foo, but where the memset's operands cannot be erased +; due to other uses. Instead, they contain a number of removable MemoryDefs; +; with non-void types result types. 
+ +define ptr @foo_with_removable_malloc() { +; CHECK-LABEL: define ptr @foo_with_removable_malloc() { +; CHECK-NEXT: [[STRUCT_ALLOCA:%.*]] = alloca [[STRUCT_TYPE:%.*]], align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6]] +; CHECK-NEXT: [[STRUCT_BYTE_4:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_ALLOCA]], i64 4 +; CHECK-NEXT: [[STRUCT_BYTE_8:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_ALLOCA]], i64 8 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_BYTE_8]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 [[TMP1]], i8 42, i64 4, i1 false) +; CHECK-NEXT: store i32 43, ptr [[STRUCT_BYTE_8]], align 4 +; CHECK-NEXT: [[RET:%.*]] = load ptr, ptr [[STRUCT_BYTE_8]], align 8 +; CHECK-NEXT: call void @readnone(ptr [[STRUCT_BYTE_4]]) +; CHECK-NEXT: call void @readnone(ptr [[STRUCT_BYTE_8]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6]] +; CHECK-NEXT: ret ptr [[RET]] +; + %struct.alloca = alloca %struct.type, align 8 + call void @llvm.lifetime.start.p0(i64 56, ptr nonnull %struct.alloca) nounwind + %struct.byte.4 = getelementptr inbounds i8, ptr %struct.alloca, i64 4 + %struct.byte.8 = getelementptr inbounds i8, ptr %struct.alloca, i64 8 + + ; Set of removable memory deffs + %m2 = tail call ptr @malloc(i64 4) + %m1 = tail call ptr @malloc(i64 4) + store i32 0, ptr %struct.byte.8 + store i32 0, ptr %struct.byte.8 + store i32 123, ptr %m1 + store i32 123, ptr %m2 + + ; Set %struct.alloca[8, 16) to 42. + call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 %struct.byte.8, i8 42, i64 8, i1 false) + ; Set %struct.alloca[8, 12) to 43. + store i32 43, ptr %struct.byte.8, align 4 + ; Set %struct.alloca[4, 8) to 44. + store i32 44, ptr %struct.byte.4, align 4 + ; Return %struct.alloca[8, 16). 
+ %ret = load ptr, ptr %struct.byte.8 + call void @readnone(ptr %struct.byte.4); + call void @readnone(ptr %struct.byte.8); + call void @llvm.lifetime.end.p0(i64 56, ptr nonnull %struct.alloca) nounwind + ret ptr %ret +} + +define ptr @foo_with_removable_malloc_free() { +; CHECK-LABEL: define ptr @foo_with_removable_malloc_free() { +; CHECK-NEXT: [[STRUCT_ALLOCA:%.*]] = alloca [[STRUCT_TYPE:%.*]], align 8 +; CHECK-NEXT: [[M1:%.*]] = tail call ptr @malloc(i64 4) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6]] +; CHECK-NEXT: [[STRUCT_BYTE_4:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_ALLOCA]], i64 4 +; CHECK-NEXT: [[STRUCT_BYTE_8:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_ALLOCA]], i64 8 +; CHECK-NEXT: [[M2:%.*]] = tail call ptr @malloc(i64 4) +; CHECK-NEXT: call void @free(ptr [[M1]]) +; CHECK-NEXT: call void @free(ptr [[M2]]) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_BYTE_8]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 [[TMP1]], i8 42, i64 4, i1 false) +; CHECK-NEXT: store i32 43, ptr [[STRUCT_BYTE_8]], align 4 +; CHECK-NEXT: [[RET:%.*]] = load ptr, ptr [[STRUCT_BYTE_8]], align 8 +; CHECK-NEXT: call void @readnone(ptr [[STRUCT_BYTE_4]]) +; CHECK-NEXT: call void @readnone(ptr [[STRUCT_BYTE_8]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6]] +; CHECK-NEXT: ret ptr [[RET]] +; + %struct.alloca = alloca %struct.type, align 8 + %m1 = tail call ptr @malloc(i64 4) + call void @llvm.lifetime.start.p0(i64 56, ptr nonnull %struct.alloca) nounwind + %struct.byte.4 = getelementptr inbounds i8, ptr %struct.alloca, i64 4 + %struct.byte.8 = getelementptr inbounds i8, ptr %struct.alloca, i64 8 + + store i32 0, ptr %struct.byte.4 + store i32 0, ptr %struct.byte.8 + %m2 = tail call ptr @malloc(i64 4) + store i32 123, ptr %m1 + call void @free(ptr %m1); + store i32 123, ptr %m2 + call void @free(ptr %m2); + + ; 
Set %struct.alloca[8, 16) to 42. + call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 %struct.byte.8, i8 42, i64 8, i1 false) + ; Set %struct.alloca[8, 12) to 43. + store i32 43, ptr %struct.byte.8, align 4 + ; Set %struct.alloca[4, 8) to 44. + store i32 44, ptr %struct.byte.4, align 4 + ; Return %struct.alloca[8, 16). + %ret = load ptr, ptr %struct.byte.8 + call void @readnone(ptr %struct.byte.4); + call void @readnone(ptr %struct.byte.8); + call void @llvm.lifetime.end.p0(i64 56, ptr nonnull %struct.alloca) nounwind + ret ptr %ret +} + +define ptr @foo_with_malloc_to_calloc() { +; CHECK-LABEL: define ptr @foo_with_malloc_to_calloc() { +; CHECK-NEXT: [[STRUCT_ALLOCA:%.*]] = alloca [[STRUCT_TYPE:%.*]], align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6]] +; CHECK-NEXT: [[STRUCT_BYTE_8:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_ALLOCA]], i64 8 +; CHECK-NEXT: [[STRUCT_BYTE_4:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_ALLOCA]], i64 4 +; CHECK-NEXT: [[CALLOC1:%.*]] = call ptr @calloc(i64 1, i64 4) +; CHECK-NEXT: [[CALLOC:%.*]] = call ptr @calloc(i64 1, i64 4) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_BYTE_8]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 [[TMP1]], i8 42, i64 4, i1 false) +; CHECK-NEXT: store i32 43, ptr [[STRUCT_BYTE_8]], align 4 +; CHECK-NEXT: [[RET:%.*]] = load ptr, ptr [[STRUCT_BYTE_8]], align 8 +; CHECK-NEXT: call void @readnone(ptr [[STRUCT_BYTE_4]]) +; CHECK-NEXT: call void @readnone(ptr [[STRUCT_BYTE_8]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6]] +; CHECK-NEXT: call void @use(ptr [[CALLOC1]]) +; CHECK-NEXT: call void @use(ptr [[CALLOC]]) +; CHECK-NEXT: ret ptr [[RET]] +; + %struct.alloca = alloca %struct.type, align 8 + call void @llvm.lifetime.start.p0(i64 56, ptr nonnull %struct.alloca) nounwind + %struct.byte.8 = getelementptr inbounds i8, ptr 
%struct.alloca, i64 8 + %struct.byte.4 = getelementptr inbounds i8, ptr %struct.alloca, i64 4 + + ; Set of removable memory deffs + %m1 = tail call ptr @malloc(i64 4) + %m2 = tail call ptr @malloc(i64 4) + store i32 0, ptr %struct.byte.4 + store i32 0, ptr %struct.byte.8 + call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 %m2, i8 0, i64 4, i1 false) + call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 %m1, i8 0, i64 4, i1 false) + + ; Set %struct.alloca[8, 16) to 42. + call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 %struct.byte.8, i8 42, i64 8, i1 false) + ; Set %struct.alloca[8, 12) to 43. + store i32 43, ptr %struct.byte.8, align 4 + ; Set %struct.alloca[4, 8) to 44. + store i32 44, ptr %struct.byte.4, align 4 + ; Return %struct.alloca[8, 16). + %ret = load ptr, ptr %struct.byte.8 + call void @readnone(ptr %struct.byte.4); + call void @readnone(ptr %struct.byte.8); + call void @llvm.lifetime.end.p0(i64 56, ptr nonnull %struct.alloca) nounwind + call void @use(ptr %m1) + call void @use(ptr %m2) + ret ptr %ret +} + +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) + +declare noalias ptr @malloc(i64) willreturn allockind("alloc,uninitialized") "alloc-family"="malloc" +declare void @readnone(ptr) readnone nounwind +declare void @free(ptr nocapture) allockind("free") "alloc-family"="malloc" + +declare void @use(ptr) diff --git a/llvm/test/Transforms/DeadStoreElimination/malloc-earliest-escape-info-invalidation.ll b/llvm/test/Transforms/DeadStoreElimination/malloc-earliest-escape-info-invalidation.ll new file mode 100644 index 00000000000000..60a010cd49ceda --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/malloc-earliest-escape-info-invalidation.ll @@ -0,0 +1,302 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -p 
dse -S %s | FileCheck %s + +target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" + +define void @widget(ptr %a) { +; CHECK-LABEL: define void @widget( +; CHECK-SAME: ptr [[A:%.*]]) { +; CHECK-NEXT: bb: +; CHECK-NEXT: [[CALL1:%.*]] = tail call noalias ptr @malloc(i64 0) +; CHECK-NEXT: store ptr [[CALL1]], ptr [[A]], align 8 +; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr [[A]], align 8 +; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr [[LOAD]], align 8 +; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i8, ptr [[CALL1]], i64 0 +; CHECK-NEXT: [[GETELEMENTPTR3:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR]], i64 1 +; CHECK-NEXT: [[GETELEMENTPTR4:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR]], i64 8 +; CHECK-NEXT: store i16 0, ptr [[GETELEMENTPTR4]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR5:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR]], i64 12 +; CHECK-NEXT: store i32 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[LOAD6:%.*]] = load i32, ptr inttoptr (i64 4 to ptr), align 4 +; CHECK-NEXT: br label [[BB48:%.*]] +; CHECK: bb7: +; CHECK-NEXT: br label [[BB9:%.*]] +; CHECK: bb8: +; CHECK-NEXT: br label [[BB53:%.*]] +; CHECK: bb9: +; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ [[CALL1]], [[BB7:%.*]] ], [ [[A]], [[BB43:%.*]] ] +; CHECK-NEXT: [[GETELEMENTPTR10:%.*]] = getelementptr i8, ptr [[PHI]], i64 0 +; CHECK-NEXT: [[GETELEMENTPTR11:%.*]] = getelementptr i8, ptr [[PHI]], i64 0 +; CHECK-NEXT: [[GETELEMENTPTR12:%.*]] = getelementptr i8, ptr [[PHI]], i64 0 +; CHECK-NEXT: [[GETELEMENTPTR13:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR12]], i64 1 +; CHECK-NEXT: store i8 0, ptr [[CALL1]], align 1 +; CHECK-NEXT: br label [[BB29:%.*]] +; CHECK: bb14: +; CHECK-NEXT: [[GETELEMENTPTR15:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR10]], i64 8 +; CHECK-NEXT: [[LOAD16:%.*]] = load i16, ptr [[CALL1]], align 4 +; CHECK-NEXT: br i1 false, label [[BB22:%.*]], label [[BB17:%.*]] +; CHECK: bb17: +; CHECK-NEXT: [[GETELEMENTPTR18:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR11]], 
i64 8 +; CHECK-NEXT: [[LOAD19:%.*]] = load i16, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR20:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR12]], i64 8 +; CHECK-NEXT: store i16 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR21:%.*]] = getelementptr i8, ptr [[PHI]], i64 0 +; CHECK-NEXT: br label [[BB25:%.*]] +; CHECK: bb22: +; CHECK-NEXT: [[GETELEMENTPTR23:%.*]] = getelementptr i8, ptr [[PHI]], i64 0 +; CHECK-NEXT: [[GETELEMENTPTR24:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR23]], i64 12 +; CHECK-NEXT: br label [[BB25]] +; CHECK: bb25: +; CHECK-NEXT: [[PHI26:%.*]] = phi ptr [ [[A]], [[BB17]] ], [ [[CALL1]], [[BB22]] ] +; CHECK-NEXT: [[PHI27:%.*]] = phi ptr [ [[CALL1]], [[BB17]] ], [ [[CALL1]], [[BB22]] ] +; CHECK-NEXT: [[PHI28:%.*]] = phi ptr [ [[CALL1]], [[BB17]] ], [ [[CALL1]], [[BB22]] ] +; CHECK-NEXT: store i32 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: br label [[BB29]] +; CHECK: bb29: +; CHECK-NEXT: [[PHI30:%.*]] = phi ptr [ [[CALL1]], [[BB9]] ], [ [[CALL1]], [[BB25]] ] +; CHECK-NEXT: [[PHI31:%.*]] = phi ptr [ [[CALL1]], [[BB9]] ], [ [[CALL1]], [[BB25]] ] +; CHECK-NEXT: [[LOAD32:%.*]] = load i8, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[LOAD33:%.*]] = load i8, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR34:%.*]] = getelementptr i8, ptr [[PHI31]], i64 12 +; CHECK-NEXT: [[GETELEMENTPTR35:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR12]], i64 12 +; CHECK-NEXT: br label [[BB86:%.*]] +; CHECK: bb36: +; CHECK-NEXT: [[GETELEMENTPTR37:%.*]] = getelementptr i8, ptr [[PHI30]], i64 12 +; CHECK-NEXT: br label [[BB38:%.*]] +; CHECK: bb38: +; CHECK-NEXT: [[GETELEMENTPTR39:%.*]] = getelementptr [0 x i32], ptr [[GETELEMENTPTR34]], i64 0, i64 0 +; CHECK-NEXT: [[LOAD40:%.*]] = load i32, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR41:%.*]] = getelementptr [0 x i32], ptr [[GETELEMENTPTR37]], i64 0, i64 0 +; CHECK-NEXT: [[LOAD42:%.*]] = load i32, ptr [[CALL1]], align 4 +; CHECK-NEXT: br label [[BB38]] +; CHECK: bb43: +; CHECK-NEXT: 
[[GETELEMENTPTR44:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR12]], i64 8 +; CHECK-NEXT: [[LOAD45:%.*]] = load i16, ptr [[CALL1]], align 4 +; CHECK-NEXT: store i16 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: store i8 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR46:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR12]], i64 12 +; CHECK-NEXT: store i32 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR47:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR12]], i64 16 +; CHECK-NEXT: store i32 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: br label [[BB9]] +; CHECK: bb48: +; CHECK-NEXT: [[GETELEMENTPTR49:%.*]] = getelementptr i8, ptr [[CALL1]], i64 0 +; CHECK-NEXT: [[GETELEMENTPTR50:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR49]], i64 1 +; CHECK-NEXT: [[GETELEMENTPTR51:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR49]], i64 8 +; CHECK-NEXT: [[GETELEMENTPTR52:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR49]], i64 12 +; CHECK-NEXT: store i32 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: br label [[BB48]] +; CHECK: bb53: +; CHECK-NEXT: [[PHI54:%.*]] = phi ptr [ [[CALL1]], [[BB8:%.*]] ], [ [[A]], [[BB71:%.*]] ] +; CHECK-NEXT: [[GETELEMENTPTR55:%.*]] = getelementptr i8, ptr [[PHI54]], i64 0 +; CHECK-NEXT: [[GETELEMENTPTR56:%.*]] = getelementptr i8, ptr [[PHI54]], i64 0 +; CHECK-NEXT: [[GETELEMENTPTR57:%.*]] = getelementptr i8, ptr [[PHI54]], i64 0 +; CHECK-NEXT: [[GETELEMENTPTR58:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR57]], i64 1 +; CHECK-NEXT: br label [[BB71]] +; CHECK: bb59: +; CHECK-NEXT: [[GETELEMENTPTR60:%.*]] = getelementptr i8, ptr [[PHI54]], i64 0 +; CHECK-NEXT: [[GETELEMENTPTR61:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR60]], i64 12 +; CHECK-NEXT: br label [[BB67:%.*]] +; CHECK: bb62: +; CHECK-NEXT: [[GETELEMENTPTR63:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR56]], i64 8 +; CHECK-NEXT: [[LOAD64:%.*]] = load i16, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR65:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR57]], i64 8 +; CHECK-NEXT: store i16 0, ptr 
[[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR66:%.*]] = getelementptr i8, ptr [[PHI54]], i64 0 +; CHECK-NEXT: br label [[BB67]] +; CHECK: bb67: +; CHECK-NEXT: [[PHI68:%.*]] = phi ptr [ [[A]], [[BB62:%.*]] ], [ [[CALL1]], [[BB59:%.*]] ] +; CHECK-NEXT: [[PHI69:%.*]] = phi ptr [ [[CALL1]], [[BB62]] ], [ [[CALL1]], [[BB59]] ] +; CHECK-NEXT: [[PHI70:%.*]] = phi ptr [ [[CALL1]], [[BB62]] ], [ [[CALL1]], [[BB59]] ] +; CHECK-NEXT: store i32 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: br label [[BB71]] +; CHECK: bb71: +; CHECK-NEXT: [[PHI72:%.*]] = phi ptr [ [[CALL1]], [[BB53]] ], [ [[CALL1]], [[BB67]] ] +; CHECK-NEXT: [[PHI73:%.*]] = phi ptr [ [[CALL1]], [[BB53]] ], [ [[CALL1]], [[BB67]] ] +; CHECK-NEXT: [[LOAD74:%.*]] = load i8, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[LOAD75:%.*]] = load i8, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR76:%.*]] = getelementptr i8, ptr [[PHI72]], i64 12 +; CHECK-NEXT: [[GETELEMENTPTR77:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR57]], i64 12 +; CHECK-NEXT: [[GETELEMENTPTR78:%.*]] = getelementptr [0 x i32], ptr [[GETELEMENTPTR76]], i64 0, i64 0 +; CHECK-NEXT: [[LOAD79:%.*]] = load i32, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR80:%.*]] = getelementptr [0 x i32], ptr [[GETELEMENTPTR77]], i64 0, i64 0 +; CHECK-NEXT: store i32 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[LOAD81:%.*]] = load i8, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR82:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR57]], i64 8 +; CHECK-NEXT: [[LOAD83:%.*]] = load i16, ptr [[CALL1]], align 4 +; CHECK-NEXT: store i16 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: store i8 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR84:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR57]], i64 12 +; CHECK-NEXT: store i32 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR85:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR57]], i64 16 +; CHECK-NEXT: store i32 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: br label [[BB53]] +; CHECK: bb86: +; CHECK-NEXT: 
[[GETELEMENTPTR87:%.*]] = getelementptr [0 x i32], ptr [[GETELEMENTPTR34]], i64 0, i64 0 +; CHECK-NEXT: [[LOAD88:%.*]] = load i32, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR89:%.*]] = getelementptr [0 x i32], ptr [[GETELEMENTPTR35]], i64 0, i64 0 +; CHECK-NEXT: store i32 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: br label [[BB86]] +; +bb: + %call = tail call ptr @malloc(i64 1) + tail call void @llvm.memset.p0.i64(ptr %call, i8 0, i64 1, i1 false) + %call1 = tail call noalias ptr @malloc(i64 0) + store ptr %call1, ptr %a, align 8 + %load = load ptr, ptr %a, align 8 + %load2 = load i32, ptr %load, align 8 + %getelementptr = getelementptr i8, ptr %call1, i64 0 + %getelementptr3 = getelementptr i8, ptr %getelementptr, i64 1 + store i8 0, ptr %call1, align 1 + %getelementptr4 = getelementptr i8, ptr %getelementptr, i64 8 + store i16 0, ptr %getelementptr4, align 4 + %getelementptr5 = getelementptr i8, ptr %getelementptr, i64 12 + store i32 0, ptr %call1, align 4 + %load6 = load i32, ptr inttoptr (i64 4 to ptr), align 4 + br label %bb48 + +bb7: ; No predecessors! + br label %bb9 + +bb8: ; No predecessors! + br label %bb53 + +bb9: ; preds = %bb43, %bb7 + %phi = phi ptr [ %call1, %bb7 ], [ %a, %bb43 ] + %getelementptr10 = getelementptr i8, ptr %phi, i64 0 + %getelementptr11 = getelementptr i8, ptr %phi, i64 0 + %getelementptr12 = getelementptr i8, ptr %phi, i64 0 + %getelementptr13 = getelementptr i8, ptr %getelementptr12, i64 1 + store i8 0, ptr %call1, align 1 + br label %bb29 + +bb14: ; No predecessors! 
+ %getelementptr15 = getelementptr i8, ptr %getelementptr10, i64 8 + %load16 = load i16, ptr %call1, align 4 + br i1 false, label %bb22, label %bb17 + +bb17: ; preds = %bb14 + %getelementptr18 = getelementptr i8, ptr %getelementptr11, i64 8 + %load19 = load i16, ptr %call1, align 4 + %getelementptr20 = getelementptr i8, ptr %getelementptr12, i64 8 + store i16 0, ptr %call1, align 4 + %getelementptr21 = getelementptr i8, ptr %phi, i64 0 + br label %bb25 + +bb22: ; preds = %bb14 + %getelementptr23 = getelementptr i8, ptr %phi, i64 0 + %getelementptr24 = getelementptr i8, ptr %getelementptr23, i64 12 + br label %bb25 + +bb25: ; preds = %bb22, %bb17 + %phi26 = phi ptr [ %a, %bb17 ], [ %call1, %bb22 ] + %phi27 = phi ptr [ %call1, %bb17 ], [ %call1, %bb22 ] + %phi28 = phi ptr [ %call1, %bb17 ], [ %call1, %bb22 ] + store i32 0, ptr %call1, align 4 + br label %bb29 + +bb29: ; preds = %bb25, %bb9 + %phi30 = phi ptr [ %call1, %bb9 ], [ %call1, %bb25 ] + %phi31 = phi ptr [ %call1, %bb9 ], [ %call1, %bb25 ] + %load32 = load i8, ptr %call1, align 4 + %load33 = load i8, ptr %call1, align 4 + %getelementptr34 = getelementptr i8, ptr %phi31, i64 12 + %getelementptr35 = getelementptr i8, ptr %getelementptr12, i64 12 + br label %bb86 + +bb36: ; No predecessors! + %getelementptr37 = getelementptr i8, ptr %phi30, i64 12 + br label %bb38 + +bb38: ; preds = %bb38, %bb36 + %getelementptr39 = getelementptr [0 x i32], ptr %getelementptr34, i64 0, i64 0 + %load40 = load i32, ptr %call1, align 4 + %getelementptr41 = getelementptr [0 x i32], ptr %getelementptr37, i64 0, i64 0 + %load42 = load i32, ptr %call1, align 4 + br label %bb38 + +bb43: ; No predecessors! 
+ %getelementptr44 = getelementptr i8, ptr %getelementptr12, i64 8 + %load45 = load i16, ptr %call1, align 4 + store i16 0, ptr %call1, align 4 + store i8 0, ptr %call1, align 4 + %getelementptr46 = getelementptr i8, ptr %getelementptr12, i64 12 + store i32 0, ptr %call1, align 4 + %getelementptr47 = getelementptr i8, ptr %getelementptr12, i64 16 + store i32 0, ptr %call1, align 4 + br label %bb9 + +bb48: ; preds = %bb48, %bb + %getelementptr49 = getelementptr i8, ptr %call1, i64 0 + %getelementptr50 = getelementptr i8, ptr %getelementptr49, i64 1 + store i8 0, ptr %call1, align 1 + %getelementptr51 = getelementptr i8, ptr %getelementptr49, i64 8 + store i16 0, ptr %call1, align 4 + %getelementptr52 = getelementptr i8, ptr %getelementptr49, i64 12 + store i32 0, ptr %call1, align 4 + br label %bb48 + +bb53: ; preds = %bb71, %bb8 + %phi54 = phi ptr [ %call1, %bb8 ], [ %a, %bb71 ] + %getelementptr55 = getelementptr i8, ptr %phi54, i64 0 + %getelementptr56 = getelementptr i8, ptr %phi54, i64 0 + %getelementptr57 = getelementptr i8, ptr %phi54, i64 0 + %getelementptr58 = getelementptr i8, ptr %getelementptr57, i64 1 + br label %bb71 + +bb59: ; No predecessors! + %getelementptr60 = getelementptr i8, ptr %phi54, i64 0 + %getelementptr61 = getelementptr i8, ptr %getelementptr60, i64 12 + br label %bb67 + +bb62: ; No predecessors! 
+ %getelementptr63 = getelementptr i8, ptr %getelementptr56, i64 8 + %load64 = load i16, ptr %call1, align 4 + %getelementptr65 = getelementptr i8, ptr %getelementptr57, i64 8 + store i16 0, ptr %call1, align 4 + %getelementptr66 = getelementptr i8, ptr %phi54, i64 0 + br label %bb67 + +bb67: ; preds = %bb62, %bb59 + %phi68 = phi ptr [ %a, %bb62 ], [ %call1, %bb59 ] + %phi69 = phi ptr [ %call1, %bb62 ], [ %call1, %bb59 ] + %phi70 = phi ptr [ %call1, %bb62 ], [ %call1, %bb59 ] + store i32 0, ptr %call1, align 4 + br label %bb71 + +bb71: ; preds = %bb67, %bb53 + %phi72 = phi ptr [ %call1, %bb53 ], [ %call1, %bb67 ] + %phi73 = phi ptr [ %call1, %bb53 ], [ %call1, %bb67 ] + %load74 = load i8, ptr %call1, align 4 + %load75 = load i8, ptr %call1, align 4 + %getelementptr76 = getelementptr i8, ptr %phi72, i64 12 + %getelementptr77 = getelementptr i8, ptr %getelementptr57, i64 12 + %getelementptr78 = getelementptr [0 x i32], ptr %getelementptr76, i64 0, i64 0 + %load79 = load i32, ptr %call1, align 4 + %getelementptr80 = getelementptr [0 x i32], ptr %getelementptr77, i64 0, i64 0 + store i32 0, ptr %call1, align 4 + %load81 = load i8, ptr %call1, align 4 + %getelementptr82 = getelementptr i8, ptr %getelementptr57, i64 8 + %load83 = load i16, ptr %call1, align 4 + store i16 0, ptr %call1, align 4 + store i8 0, ptr %call1, align 4 + %getelementptr84 = getelementptr i8, ptr %getelementptr57, i64 12 + store i32 0, ptr %call1, align 4 + %getelementptr85 = getelementptr i8, ptr %getelementptr57, i64 16 + store i32 0, ptr %call1, align 4 + br label %bb53 + +bb86: ; preds = %bb86, %bb29 + %getelementptr87 = getelementptr [0 x i32], ptr %getelementptr34, i64 0, i64 0 + %load88 = load i32, ptr %call1, align 4 + %getelementptr89 = getelementptr [0 x i32], ptr %getelementptr35, i64 0, i64 0 + store i32 0, ptr %call1, align 4 + br label %bb86 +} + +declare ptr @malloc(i64) + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void 
@llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #0 + +attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: write) } diff --git a/llvm/test/Transforms/Float2Int/pr79158.ll b/llvm/test/Transforms/Float2Int/pr79158.ll new file mode 100644 index 00000000000000..5e78cc0bc66fdb --- /dev/null +++ b/llvm/test/Transforms/Float2Int/pr79158.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=float2int -S | FileCheck %s + +define i32 @pr79158(i32 %x) { +; CHECK-LABEL: define i32 @pr79158( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = zext i1 [[CMP]] to i64 +; CHECK-NEXT: [[MUL1:%.*]] = mul i64 [[TMP0]], 4294967295 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[MUL1]] to i32 +; CHECK-NEXT: ret i32 [[TMP1]] +; +entry: + %cmp = icmp sgt i32 %x, 0 + %conv = uitofp i1 %cmp to double + %mul = fmul double %conv, 0x41EFFFFFFFE00000 + %conv1 = fptoui double %mul to i32 + ret i32 %conv1 +} diff --git a/llvm/test/Transforms/GVN/pr82884.ll b/llvm/test/Transforms/GVN/pr82884.ll new file mode 100644 index 00000000000000..71abafda60d93d --- /dev/null +++ b/llvm/test/Transforms/GVN/pr82884.ll @@ -0,0 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=gvn < %s | FileCheck %s + +; Make sure nsw/nuw flags are dropped. 
+ +define i32 @pr82884(i32 %x) { +; CHECK-LABEL: define i32 @pr82884( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[X]], [[X]] +; CHECK-NEXT: call void @use(i32 [[MUL]]) +; CHECK-NEXT: [[MUL2:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[X]], i32 [[X]]) +; CHECK-NEXT: ret i32 [[MUL]] +; + %mul = mul nsw nuw i32 %x, %x + call void @use(i32 %mul) + %mul2 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %x, i32 %x) + %ret = extractvalue { i32, i1 } %mul2, 0 + ret i32 %ret +} + +declare void @use(i32) diff --git a/llvm/test/Transforms/IRCE/pr89959.ll b/llvm/test/Transforms/IRCE/pr89959.ll new file mode 100644 index 00000000000000..dc7c0dfbc57a97 --- /dev/null +++ b/llvm/test/Transforms/IRCE/pr89959.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=irce -S < %s 2>&1 | FileCheck %s + +; Make sure we don't crash. +define void @pr89959() { +; CHECK-LABEL: define void @pr89959() { +; CHECK-NEXT: top: +; CHECK-NEXT: br label [[L3:%.*]] +; CHECK: L3: +; CHECK-NEXT: [[VALUE_PHI:%.*]] = phi ptr [ null, [[TOP:%.*]] ], [ [[TMP0:%.*]], [[L13:%.*]] ] +; CHECK-NEXT: [[TMP0]] = getelementptr i8, ptr [[VALUE_PHI]], i64 8 +; CHECK-NEXT: [[DOTNOT:%.*]] = icmp ule ptr [[VALUE_PHI]], null +; CHECK-NEXT: br i1 [[DOTNOT]], label [[L13]], label [[L15:%.*]] +; CHECK: L13: +; CHECK-NEXT: br label [[L3]] +; CHECK: L15: +; CHECK-NEXT: ret void +; +top: + br label %L3 + +L3: + %value_phi = phi ptr [ null, %top ], [ %0, %L13 ] + %0 = getelementptr i8, ptr %value_phi, i64 8 + %.not = icmp ule ptr %value_phi, null + br i1 %.not, label %L13, label %L15 + +L13: + br label %L3 + +L15: + ret void +} diff --git a/llvm/test/Transforms/IndVarSimplify/pr55925.ll b/llvm/test/Transforms/IndVarSimplify/pr55925.ll index 420fc209949d4f..312a8295ccdc9f 100644 --- a/llvm/test/Transforms/IndVarSimplify/pr55925.ll +++ b/llvm/test/Transforms/IndVarSimplify/pr55925.ll @@ -18,9 +18,9 @@ 
define void @test(ptr %p) personality ptr undef { ; CHECK-NEXT: [[RES:%.*]] = invoke i32 @foo(i32 returned [[TMP0]]) ; CHECK-NEXT: to label [[LOOP_LATCH]] unwind label [[EXIT:%.*]] ; CHECK: loop.latch: -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @foo(i32 [[TMP1]]) +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: br label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: [[LP:%.*]] = landingpad { ptr, i32 } @@ -64,8 +64,8 @@ define void @test_critedge(i1 %c, ptr %p) personality ptr undef { ; CHECK-NEXT: br label [[LOOP_LATCH]] ; CHECK: loop.latch: ; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[TMP1]], [[LOOP_INVOKE]] ], [ 0, [[LOOP_OTHER]] ] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @foo(i32 [[PHI]]) +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: br label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: [[LP:%.*]] = landingpad { ptr, i32 } diff --git a/llvm/test/Transforms/IndVarSimplify/pr79861.ll b/llvm/test/Transforms/IndVarSimplify/pr79861.ll new file mode 100644 index 00000000000000..66250944961397 --- /dev/null +++ b/llvm/test/Transforms/IndVarSimplify/pr79861.ll @@ -0,0 +1,107 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=indvars < %s | FileCheck %s + +target datalayout = "n64" + +declare void @use(i64) + +define void @or_disjoint() { +; CHECK-LABEL: define void @or_disjoint() { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 2, [[ENTRY:%.*]] ], [ [[IV_DEC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[IV]], 1 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[SEL:%.*]] = select i1 false, i64 [[OR]], i64 [[ADD]] +; CHECK-NEXT: call void @use(i64 [[SEL]]) +; CHECK-NEXT: [[IV_DEC]] = 
add nsw i64 [[IV]], -1 +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_DEC]], 0 +; CHECK-NEXT: br i1 [[EXIT_COND]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 2, %entry ], [ %iv.dec, %loop ] + %or = or disjoint i64 %iv, 1 + %add = add nsw i64 %iv, 1 + %sel = select i1 false, i64 %or, i64 %add + call void @use(i64 %sel) + + %iv.dec = add nsw i64 %iv, -1 + %exit.cond = icmp eq i64 %iv.dec, 0 + br i1 %exit.cond, label %exit, label %loop + +exit: + ret void +} + +define void @add_nowrap_flags(i64 %n) { +; CHECK-LABEL: define void @add_nowrap_flags( +; CHECK-SAME: i64 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_INC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[IV]], 123 +; CHECK-NEXT: call void @use(i64 [[ADD1]]) +; CHECK-NEXT: [[IV_INC]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_INC]], [[N]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.inc, %loop ] + %add1 = add nuw nsw i64 %iv, 123 + %add2 = add i64 %iv, 123 + %sel = select i1 false, i64 %add1, i64 %add2 + call void @use(i64 %sel) + + %iv.inc = add i64 %iv, 1 + %exit.cond = icmp eq i64 %iv.inc, %n + br i1 %exit.cond, label %exit, label %loop + +exit: + ret void +} + + +define void @expander_or_disjoint(i64 %n) { +; CHECK-LABEL: define void @expander_or_disjoint( +; CHECK-SAME: i64 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[N]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_INC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_INC]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[ADD:%.*]] = add i64 [[IV]], 
[[OR]] +; CHECK-NEXT: call void @use(i64 [[ADD]]) +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_INC]], [[TMP0]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %or = or disjoint i64 %n, 1 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.inc, %loop ] + %iv.inc = add i64 %iv, 1 + %add = add i64 %iv, %or + call void @use(i64 %add) + %cmp = icmp ult i64 %iv, %n + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} diff --git a/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll b/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll index d6b1f3ef45e765..7723e6c664c3d5 100644 --- a/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll +++ b/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll @@ -1,71 +1,70 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -mattr=+sme -S -passes=inline | FileCheck %s -declare void @inlined_body() "aarch64_pstate_sm_compatible"; +declare i32 @llvm.vscale.i32() -; Define some functions that will be called by the functions below. -; These just call a '...body()' function. If we see the call to one of -; these functions being replaced by '...body()', then we know it has been -; inlined. +; Define some functions that merely call llvm.vscale.i32(), which will be called +; by the other functions below. If we see the call to one of these functions +; being replaced by 'llvm.vscale()', then we know it has been inlined. 
-define void @normal_callee() { -; CHECK-LABEL: define void @normal_callee +define i32 @normal_callee() { +; CHECK-LABEL: define i32 @normal_callee ; CHECK-SAME: () #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @inlined_body() - ret void + %res = call i32 @llvm.vscale.i32() + ret i32 %res } -define void @streaming_callee() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: define void @streaming_callee +define i32 @streaming_callee() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define i32 @streaming_callee ; CHECK-SAME: () #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @inlined_body() - ret void + %res = call i32 @llvm.vscale.i32() + ret i32 %res } -define void @locally_streaming_callee() "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @locally_streaming_callee +define i32 @locally_streaming_callee() "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @locally_streaming_callee ; CHECK-SAME: () #[[ATTR3:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @inlined_body() - ret void + %res = call i32 @llvm.vscale.i32() + ret i32 %res } -define void @streaming_compatible_callee() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: define void @streaming_compatible_callee +define i32 @streaming_compatible_callee() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define i32 @streaming_compatible_callee ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vscale.i32() +; 
CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @inlined_body() - ret void + %res = call i32 @llvm.vscale.i32() + ret i32 %res } -define void @streaming_compatible_locally_streaming_callee() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @streaming_compatible_locally_streaming_callee +define i32 @streaming_compatible_locally_streaming_callee() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_callee ; CHECK-SAME: () #[[ATTR4:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @inlined_body() - ret void + %res = call i32 @llvm.vscale() + ret i32 %res } ; Now test that inlining only happens when their streaming modes match. @@ -85,16 +84,16 @@ entry: ; [ ] N -> SC ; [ ] N -> N + B ; [ ] N -> SC + B -define void @normal_caller_normal_callee_inline() { -; CHECK-LABEL: define void @normal_caller_normal_callee_inline +define i32 @normal_caller_normal_callee_inline() { +; CHECK-LABEL: define i32 @normal_caller_normal_callee_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @normal_callee() - ret void + %res = call i32 @normal_callee() + ret i32 %res } ; [ ] N -> N @@ -102,16 +101,16 @@ entry: ; [ ] N -> SC ; [ ] N -> N + B ; [ ] N -> SC + B -define void @normal_caller_streaming_callee_inline() { -; CHECK-LABEL: define void @normal_caller_streaming_callee_inline +define i32 @normal_caller_streaming_callee_dont_inline() { +; CHECK-LABEL: define i32 @normal_caller_streaming_callee_dont_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: 
[[RES:%.*]] = call i32 @streaming_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @streaming_callee() - ret void + %res = call i32 @streaming_callee() + ret i32 %res } ; [ ] N -> N @@ -119,16 +118,16 @@ entry: ; [x] N -> SC ; [ ] N -> N + B ; [ ] N -> SC + B -define void @normal_caller_streaming_compatible_callee_inline() { -; CHECK-LABEL: define void @normal_caller_streaming_compatible_callee_inline +define i32 @normal_caller_streaming_compatible_callee_inline() { +; CHECK-LABEL: define i32 @normal_caller_streaming_compatible_callee_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_callee() - ret void + %res = call i32 @streaming_compatible_callee() + ret i32 %res } ; [ ] N -> N @@ -136,16 +135,16 @@ entry: ; [ ] N -> SC ; [x] N -> N + B ; [ ] N -> SC + B -define void @normal_caller_locally_streaming_callee_inline() { -; CHECK-LABEL: define void @normal_caller_locally_streaming_callee_inline +define i32 @normal_caller_locally_streaming_callee_dont_inline() { +; CHECK-LABEL: define i32 @normal_caller_locally_streaming_callee_dont_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @locally_streaming_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @locally_streaming_callee() - ret void + %res = call i32 @locally_streaming_callee() + ret i32 %res } ; [ ] N -> N @@ -153,16 +152,16 @@ entry: ; [ ] N -> SC ; [ ] N -> N + B ; [x] N -> SC + B -define void @normal_caller_streaming_compatible_locally_streaming_callee_inline() { -; CHECK-LABEL: define void @normal_caller_streaming_compatible_locally_streaming_callee_inline +define i32 @normal_caller_streaming_compatible_locally_streaming_callee_dont_inline() { +; CHECK-LABEL: define i32 
@normal_caller_streaming_compatible_locally_streaming_callee_dont_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @streaming_compatible_locally_streaming_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @streaming_compatible_locally_streaming_callee() - ret void + %res = call i32 @streaming_compatible_locally_streaming_callee() + ret i32 %res } ; [x] S -> N @@ -170,16 +169,16 @@ entry: ; [ ] S -> SC ; [ ] S -> N + B ; [ ] S -> SC + B -define void @streaming_caller_normal_callee_inline() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: define void @streaming_caller_normal_callee_inline +define i32 @streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define i32 @streaming_caller_normal_callee_dont_inline ; CHECK-SAME: () #[[ATTR2]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @normal_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @normal_callee() - ret void + %res = call i32 @normal_callee() + ret i32 %res } ; [ ] S -> N @@ -187,16 +186,16 @@ entry: ; [ ] S -> SC ; [ ] S -> N + B ; [ ] S -> SC + B -define void @streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: define void @streaming_caller_streaming_callee_inline +define i32 @streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define i32 @streaming_caller_streaming_callee_inline ; CHECK-SAME: () #[[ATTR2]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_callee() - ret void + %res = call i32 @streaming_callee() + ret i32 %res } ; [ ] S -> N @@ -204,16 +203,16 @@ entry: ; [x] S -> SC ; [ ] S -> N + B ; [ ] S -> SC + B -define void 
@streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: define void @streaming_caller_streaming_compatible_callee_inline +define i32 @streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define i32 @streaming_caller_streaming_compatible_callee_inline ; CHECK-SAME: () #[[ATTR2]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_callee() - ret void + %res = call i32 @streaming_compatible_callee() + ret i32 %res } ; [ ] S -> N @@ -221,16 +220,16 @@ entry: ; [ ] S -> SC ; [x] S -> N + B ; [ ] S -> SC + B -define void @streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: define void @streaming_caller_locally_streaming_callee_inline +define i32 @streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define i32 @streaming_caller_locally_streaming_callee_inline ; CHECK-SAME: () #[[ATTR2]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @locally_streaming_callee() - ret void + %res = call i32 @locally_streaming_callee() + ret i32 %res } ; [ ] S -> N @@ -238,16 +237,16 @@ entry: ; [ ] S -> SC ; [ ] S -> N + B ; [x] S -> SC + B -define void @streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: define void @streaming_caller_streaming_compatible_locally_streaming_callee_inline +define i32 @streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define i32 @streaming_caller_streaming_compatible_locally_streaming_callee_inline ; CHECK-SAME: () #[[ATTR2]] { ; 
CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_locally_streaming_callee() - ret void + %res = call i32 @streaming_compatible_locally_streaming_callee() + ret i32 %res } ; [x] N + B -> N @@ -255,16 +254,16 @@ entry: ; [ ] N + B -> SC ; [ ] N + B -> N + B ; [ ] N + B -> SC + B -define void @locally_streaming_caller_normal_callee_inline() "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @locally_streaming_caller_normal_callee_inline +define i32 @locally_streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @locally_streaming_caller_normal_callee_dont_inline ; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @normal_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @normal_callee() - ret void + %res = call i32 @normal_callee() + ret i32 %res } ; [ ] N + B -> N @@ -272,16 +271,16 @@ entry: ; [ ] N + B -> SC ; [ ] N + B -> N + B ; [ ] N + B -> SC + B -define void @locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @locally_streaming_caller_streaming_callee_inline +define i32 @locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @locally_streaming_caller_streaming_callee_inline ; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_callee() - ret void + %res = call i32 @streaming_callee() + ret i32 %res } ; [ ] N + B -> N @@ -289,16 +288,16 @@ entry: ; [x] N + B -> SC ; [ ] N + B -> N + B ; [ ] N + B -> SC + B -define void 
@locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @locally_streaming_caller_streaming_compatible_callee_inline +define i32 @locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @locally_streaming_caller_streaming_compatible_callee_inline ; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_callee() - ret void + %res = call i32 @streaming_compatible_callee() + ret i32 %res } ; [ ] N + B -> N @@ -306,16 +305,16 @@ entry: ; [ ] N + B -> SC ; [x] N + B -> N + B ; [ ] N + B -> SC + B -define void @locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @locally_streaming_caller_locally_streaming_callee_inline +define i32 @locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @locally_streaming_caller_locally_streaming_callee_inline ; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @locally_streaming_callee() - ret void + %res = call i32 @locally_streaming_callee() + ret i32 %res } ; [ ] N + B -> N @@ -323,16 +322,16 @@ entry: ; [ ] N + B -> SC ; [ ] N + B -> N + B ; [x] N + B -> SC + B -define void @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline +define i32 @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 
@locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline ; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_locally_streaming_callee() - ret void + %res = call i32 @streaming_compatible_locally_streaming_callee() + ret i32 %res } ; [x] SC -> N @@ -340,16 +339,16 @@ entry: ; [ ] SC -> SC ; [ ] SC -> N + B ; [ ] SC -> SC + B -define void @streaming_compatible_caller_normal_callee_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: define void @streaming_compatible_caller_normal_callee_inline +define i32 @streaming_compatible_caller_normal_callee_dont_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define i32 @streaming_compatible_caller_normal_callee_dont_inline ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @normal_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @normal_callee() - ret void + %res = call i32 @normal_callee() + ret i32 %res } ; [ ] SC -> N @@ -357,16 +356,16 @@ entry: ; [ ] SC -> SC ; [ ] SC -> N + B ; [ ] SC -> SC + B -define void @streaming_compatible_caller_streaming_callee_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: define void @streaming_compatible_caller_streaming_callee_inline +define i32 @streaming_compatible_caller_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define i32 @streaming_compatible_caller_streaming_callee_dont_inline ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @streaming_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @streaming_callee() - ret void + %res = call i32 @streaming_callee() + ret i32 %res } ; [ ] SC 
-> N @@ -374,16 +373,16 @@ entry: ; [x] SC -> SC ; [ ] SC -> N + B ; [ ] SC -> SC + B -define void @streaming_compatible_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: define void @streaming_compatible_caller_streaming_compatible_callee_inline +define i32 @streaming_compatible_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define i32 @streaming_compatible_caller_streaming_compatible_callee_inline ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_callee() - ret void + %res = call i32 @streaming_compatible_callee() + ret i32 %res } ; [ ] SC -> N @@ -391,16 +390,16 @@ entry: ; [ ] SC -> SC ; [x] SC -> N + B ; [ ] SC -> SC + B -define void @streaming_compatible_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: define void @streaming_compatible_caller_locally_streaming_callee_inline +define i32 @streaming_compatible_caller_locally_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define i32 @streaming_compatible_caller_locally_streaming_callee_dont_inline ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @locally_streaming_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @locally_streaming_callee() - ret void + %res = call i32 @locally_streaming_callee() + ret i32 %res } ; [ ] SC -> N @@ -408,32 +407,32 @@ entry: ; [ ] SC -> SC ; [ ] SC -> N + B ; [x] SC -> SC + B -define void @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: define void @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_inline 
+define i32 @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define i32 @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @streaming_compatible_locally_streaming_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @streaming_compatible_locally_streaming_callee() - ret void + %res = call i32 @streaming_compatible_locally_streaming_callee() + ret i32 %res } ; [x] SC + B -> N ; [ ] SC + B -> S ; [ ] SC + B -> SC ; [ ] SC + B -> N + B ; [ ] SC + B -> SC + B -define void @streaming_compatible_locally_streaming_caller_normal_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_normal_callee_inline +define i32 @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @normal_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @normal_callee() - ret void + %res = call i32 @normal_callee() + ret i32 %res } ; [ ] SC + B -> N @@ -441,16 +440,16 @@ entry: ; [ ] SC + B -> SC ; [ ] SC + B -> N + B ; [ ] SC + B -> SC + B -define void @streaming_compatible_locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_streaming_callee_inline +define i32 @streaming_compatible_locally_streaming_caller_streaming_callee_inline() 
"aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_streaming_callee_inline ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_callee() - ret void + %res = call i32 @streaming_callee() + ret i32 %res } ; [ ] SC + B -> N @@ -458,16 +457,16 @@ entry: ; [x] SC + B -> SC ; [ ] SC + B -> N + B ; [ ] SC + B -> SC + B -define void @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline +define i32 @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_callee() - ret void + %res = call i32 @streaming_compatible_callee() + ret i32 %res } ; [ ] SC + B -> N @@ -475,16 +474,16 @@ entry: ; [ ] SC + B -> SC ; [x] SC + B -> N + B ; [ ] SC + B -> SC + B -define void @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline +define i32 @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 
@streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @locally_streaming_callee() - ret void + %res = call i32 @locally_streaming_callee() + ret i32 %res } ; [ ] SC + B -> N @@ -492,16 +491,16 @@ entry: ; [ ] SC + B -> SC ; [ ] SC + B -> N + B ; [x] SC + B -> SC + B -define void @streaming_compatible_locally_streaming_caller_and_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_and_callee_inline +define i32 @streaming_compatible_locally_streaming_caller_and_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_and_callee_inline ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_locally_streaming_callee() - ret void + %res = call i32 @streaming_compatible_locally_streaming_callee() + ret i32 %res } define void @normal_callee_with_inlineasm() { diff --git a/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll b/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll index 3a30980fe31bd7..6f582cab2f1452 100644 --- a/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll +++ b/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll @@ -93,3 +93,29 @@ define internal void @caller_not_avx4() { } declare i64 @caller_unknown_simple(i64) + +; This call should get inlined, because the callee only contains +; inline ASM, not real calls. 
+define <8 x i64> @caller_inline_asm(ptr %p0, i64 %k, ptr %p1, ptr %p2) #0 { +; CHECK-LABEL: define {{[^@]+}}@caller_inline_asm +; CHECK-SAME: (ptr [[P0:%.*]], i64 [[K:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[SRC_I:%.*]] = load <8 x i64>, ptr [[P0]], align 64 +; CHECK-NEXT: [[A_I:%.*]] = load <8 x i64>, ptr [[P1]], align 64 +; CHECK-NEXT: [[B_I:%.*]] = load <8 x i64>, ptr [[P2]], align 64 +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> asm "vpaddb\09$($3, $2, $0 {$1}", "=v,^Yk,v,v,0,~{dirflag},~{fpsr},~{flags}"(i64 [[K]], <8 x i64> [[A_I]], <8 x i64> [[B_I]], <8 x i64> [[SRC_I]]) +; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; + %call = call <8 x i64> @callee_inline_asm(ptr %p0, i64 %k, ptr %p1, ptr %p2) + ret <8 x i64> %call +} + +define internal <8 x i64> @callee_inline_asm(ptr %p0, i64 %k, ptr %p1, ptr %p2) #1 { + %src = load <8 x i64>, ptr %p0, align 64 + %a = load <8 x i64>, ptr %p1, align 64 + %b = load <8 x i64>, ptr %p2, align 64 + %3 = tail call <8 x i64> asm "vpaddb\09$($3, $2, $0 {$1}", "=v,^Yk,v,v,0,~{dirflag},~{fpsr},~{flags}"(i64 %k, <8 x i64> %a, <8 x i64> %b, <8 x i64> %src) #2 + ret <8 x i64> %3 +} + +attributes #0 = { "min-legal-vector-width"="512" "target-features"="+avx,+avx2,+avx512bw,+avx512dq,+avx512f,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" } +attributes #1 = { "min-legal-vector-width"="512" "target-features"="+avx,+avx2,+avx512bw,+avx512f,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" } diff --git a/llvm/test/Transforms/Inline/inline-sign-return-address.ll b/llvm/test/Transforms/Inline/inline-sign-return-address.ll new file mode 100644 index 00000000000000..c4d85fa671a4f6 --- /dev/null +++ b/llvm/test/Transforms/Inline/inline-sign-return-address.ll @@ -0,0 +1,104 @@ +; Check the inliner doesn't inline a function with different sign return 
address schemes. +; RUN: opt < %s -passes=inline -S | FileCheck %s + +define internal void @foo_all() #0 { + ret void +} + +define internal void @foo_nonleaf() #1 { + ret void +} + +define internal void @foo_none() #2 { + ret void +} + +define internal void @foo_lr() #3 { + ret void +} + +define internal void @foo_bkey() #4 { + ret void +} + +define dso_local void @bar_all() #0 { +; CHECK-LABEL: bar_all +; CHECK-NOT: call void @foo_all() +; CHECK-NEXT: call void @foo_nonleaf() +; CHECK-NEXT: call void @foo_none() +; CHECK-NEXT: call void @foo_lr() +; CHECK-NEXT: call void @foo_bkey() + call void @foo_all() + call void @foo_nonleaf() + call void @foo_none() + call void @foo_lr() + call void @foo_bkey() + ret void +} + +define dso_local void @bar_nonleaf() #1 { +; CHECK-LABEL: bar_nonleaf +; CHECK-NEXT: call void @foo_all() +; CHECK-NOT: call void @foo_nonleaf() +; CHECK-NEXT: call void @foo_none() +; CHECK-NEXT: call void @foo_lr() +; CHECK-NEXT: call void @foo_bkey() + call void @foo_all() + call void @foo_nonleaf() + call void @foo_none() + call void @foo_lr() + call void @foo_bkey() + ret void +} + +define dso_local void @bar_none() #2 { +; CHECK-LABEL: bar_none +; CHECK-NEXT: call void @foo_all() +; CHECK-NEXT: call void @foo_nonleaf() +; CHECK-NOT: call void @foo_none() +; CHECK-NEXT: call void @foo_lr() +; CHECK-NEXT: call void @foo_bkey() + call void @foo_all() + call void @foo_nonleaf() + call void @foo_none() + call void @foo_lr() + call void @foo_bkey() + ret void +} + +define dso_local void @bar_lr() #3 { +; CHECK-LABEL: bar_lr +; CHECK-NEXT: call void @foo_all() +; CHECK-NEXT: call void @foo_nonleaf() +; CHECK-NEXT: call void @foo_none() +; CHECK-NOT: call void @foo_lr() +; CHECK-NEXT: call void @foo_bkey() + call void @foo_all() + call void @foo_nonleaf() + call void @foo_none() + call void @foo_lr() + call void @foo_bkey() + ret void +} + +define dso_local void @bar_bkey() #4 { +; CHECK-LABEL: bar_bkey +; CHECK-NEXT: call void @foo_all() +; CHECK-NEXT: 
call void @foo_nonleaf() +; CHECK-NEXT: call void @foo_none() +; CHECK-NEXT: call void @foo_lr() +; CHECK-NOT: call void @foo_bkey() + call void @foo_all() + call void @foo_nonleaf() + call void @foo_none() + call void @foo_lr() + call void @foo_bkey() + ret void +} + + +attributes #0 = { "branch-protection-pauth-lr"="false" "sign-return-address"="all" } +attributes #1 = { "branch-protection-pauth-lr"="false" "sign-return-address"="non-leaf" } +attributes #2 = { "branch-protection-pauth-lr"="false" "sign-return-address"="none" } +attributes #3 = { "branch-protection-pauth-lr"="true" "sign-return-address"="non-leaf" } +attributes #4 = { "branch-protection-pauth-lr"="true" "sign-return-address"="non-leaf" "sign-return-address-key"="b_key" } \ No newline at end of file diff --git a/llvm/test/Transforms/InstCombine/bitcast.ll b/llvm/test/Transforms/InstCombine/bitcast.ll index 58bd81297b0dd9..5ace1039c37825 100644 --- a/llvm/test/Transforms/InstCombine/bitcast.ll +++ b/llvm/test/Transforms/InstCombine/bitcast.ll @@ -686,6 +686,21 @@ define ptr @bitcast_from_single_element_pointer_vector_to_pointer(<1 x ptr> %ptr ret ptr %ptr } +; Sure that we calculate the correct shift. 
+define <4 x i32> @bitcast_shl(i32 %arg) { +; CHECK-LABEL: @bitcast_shl( +; CHECK-NEXT: [[I5:%.*]] = insertelement <4 x i32> , i32 [[ARG:%.*]], i64 3 +; CHECK-NEXT: ret <4 x i32> [[I5]] +; + %i = zext i32 %arg to i64 + %i1 = shl i64 %i, 32 + %i2 = or i64 %i1, 65 + %i3 = zext i64 %i2 to i128 + %i4 = shl i128 %i3, 64 + %i5 = bitcast i128 %i4 to <4 x i32> + ret <4 x i32> %i5 +} + declare void @f1() declare void @f2() define ptr @select_bitcast_unsized_pointer(i1 %c) { diff --git a/llvm/test/Transforms/InstCombine/getelementptr.ll b/llvm/test/Transforms/InstCombine/getelementptr.ll index 642c3eb2a0e41b..c90b6c9fb29592 100644 --- a/llvm/test/Transforms/InstCombine/getelementptr.ll +++ b/llvm/test/Transforms/InstCombine/getelementptr.ll @@ -116,6 +116,7 @@ define void @test_overaligned_vec(i8 %B) { ; CHECK-LABEL: @test_overaligned_vec( ; CHECK-NEXT: store i8 [[B:%.*]], ptr getelementptr inbounds ([10 x i8], ptr @Global, i64 0, i64 2), align 1 ; CHECK-NEXT: ret void +; %A = getelementptr <2 x half>, ptr @Global, i64 0, i64 1 store i8 %B, ptr %A ret void @@ -1473,6 +1474,16 @@ define ptr @gep_sdiv(ptr %p, i64 %off) { ret ptr %ptr } +define ptr @gep_udiv(ptr %p, i64 %off) { +; CHECK-LABEL: @gep_udiv( +; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[OFF:%.*]] +; CHECK-NEXT: ret ptr [[PTR]] +; + %index = udiv exact i64 %off, 7 + %ptr = getelementptr %struct.C, ptr %p, i64 %index + ret ptr %ptr +} + define <2 x ptr> @gep_sdiv_vec(<2 x ptr> %p, <2 x i64> %off) { ; CHECK-LABEL: @gep_sdiv_vec( ; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, <2 x ptr> [[P:%.*]], <2 x i64> [[OFF:%.*]] @@ -1503,6 +1514,16 @@ define ptr @gep_ashr(ptr %p, i64 %off) { ret ptr %ptr } +define ptr @gep_lshr(ptr %p, i64 %off) { +; CHECK-LABEL: @gep_lshr( +; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[OFF:%.*]] +; CHECK-NEXT: ret ptr [[PTR]] +; + %index = lshr exact i64 %off, 2 + %ptr = getelementptr i32, ptr %p, i64 %index + ret ptr %ptr +} + ; Negative tests define 
ptr @gep_i8(ptr %p, i64 %off) { @@ -1525,6 +1546,17 @@ define ptr @gep_sdiv_mismatched_size(ptr %p, i64 %off) { ret ptr %ptr } +define ptr @gep_udiv_mismatched_size(ptr %p, i64 %off) { +; CHECK-LABEL: @gep_udiv_mismatched_size( +; CHECK-NEXT: [[INDEX:%.*]] = udiv exact i64 [[OFF:%.*]], 20 +; CHECK-NEXT: [[PTR:%.*]] = getelementptr [[STRUCT_C:%.*]], ptr [[P:%.*]], i64 [[INDEX]] +; CHECK-NEXT: ret ptr [[PTR]] +; + %index = udiv exact i64 %off, 20 + %ptr = getelementptr %struct.C, ptr %p, i64 %index + ret ptr %ptr +} + define ptr @gep_sdiv_without_exact(ptr %p, i64 %off) { ; CHECK-LABEL: @gep_sdiv_without_exact( ; CHECK-NEXT: [[INDEX:%.*]] = sdiv i64 [[OFF:%.*]], 7 @@ -1536,6 +1568,17 @@ define ptr @gep_sdiv_without_exact(ptr %p, i64 %off) { ret ptr %ptr } +define ptr @gep_udiv_without_exact(ptr %p, i64 %off) { +; CHECK-LABEL: @gep_udiv_without_exact( +; CHECK-NEXT: [[INDEX:%.*]] = udiv i64 [[OFF:%.*]], 7 +; CHECK-NEXT: [[PTR:%.*]] = getelementptr [[STRUCT_C:%.*]], ptr [[P:%.*]], i64 [[INDEX]] +; CHECK-NEXT: ret ptr [[PTR]] +; + %index = udiv i64 %off, 7 + %ptr = getelementptr %struct.C, ptr %p, i64 %index + ret ptr %ptr +} + define ptr @gep_ashr_without_exact(ptr %p, i64 %off) { ; CHECK-LABEL: @gep_ashr_without_exact( ; CHECK-NEXT: [[INDEX:%.*]] = ashr i64 [[OFF:%.*]], 2 @@ -1547,6 +1590,17 @@ define ptr @gep_ashr_without_exact(ptr %p, i64 %off) { ret ptr %ptr } +define ptr @gep_lshr_without_exact(ptr %p, i64 %off) { +; CHECK-LABEL: @gep_lshr_without_exact( +; CHECK-NEXT: [[INDEX:%.*]] = lshr i64 [[OFF:%.*]], 2 +; CHECK-NEXT: [[PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[INDEX]] +; CHECK-NEXT: ret ptr [[PTR]] +; + %index = lshr i64 %off, 2 + %ptr = getelementptr i32, ptr %p, i64 %index + ret ptr %ptr +} + define i1 @test_only_used_by_icmp(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test_only_used_by_icmp( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[B:%.*]], [[C:%.*]] diff --git a/llvm/test/Transforms/InstCombine/intrinsic-select.ll 
b/llvm/test/Transforms/InstCombine/intrinsic-select.ll index a203b28bcb82a8..f37226bbd5b09c 100644 --- a/llvm/test/Transforms/InstCombine/intrinsic-select.ll +++ b/llvm/test/Transforms/InstCombine/intrinsic-select.ll @@ -240,3 +240,43 @@ define i32 @vec_to_scalar_select_vector(<2 x i1> %b) { %c = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %s) ret i32 %c } + +define i8 @test_drop_noundef(i1 %cond, i8 %val) { +; CHECK-LABEL: @test_drop_noundef( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i8 @llvm.smin.i8(i8 [[VAL:%.*]], i8 0) +; CHECK-NEXT: [[RET:%.*]] = select i1 [[COND:%.*]], i8 -1, i8 [[TMP0]] +; CHECK-NEXT: ret i8 [[RET]] +; +entry: + %sel = select i1 %cond, i8 -1, i8 %val + %ret = call noundef i8 @llvm.smin.i8(i8 %sel, i8 0) + ret i8 %ret +} + +define i1 @pr85536(i32 %a) { +; CHECK-LABEL: @pr85536( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i32 [[A:%.*]], 31 +; CHECK-NEXT: [[SHL1:%.*]] = shl nsw i32 -1, [[A]] +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SHL1]] to i64 +; CHECK-NEXT: [[SHL2:%.*]] = shl i64 [[ZEXT]], 48 +; CHECK-NEXT: [[SHR:%.*]] = ashr exact i64 [[SHL2]], 48 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.smin.i64(i64 [[SHR]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 65535 +; CHECK-NEXT: [[RET1:%.*]] = icmp eq i64 [[TMP1]], 0 +; CHECK-NEXT: [[RET:%.*]] = select i1 [[CMP1]], i1 [[RET1]], i1 false +; CHECK-NEXT: ret i1 [[RET]] +; +entry: + %cmp1 = icmp ugt i32 %a, 30 + %shl1 = shl nsw i32 -1, %a + %zext = zext i32 %shl1 to i64 + %shl2 = shl i64 %zext, 48 + %shr = ashr exact i64 %shl2, 48 + %sel = select i1 %cmp1, i64 -1, i64 %shr + %smin = call noundef i64 @llvm.smin.i64(i64 %sel, i64 0) + %masked = and i64 %smin, 65535 + %ret = icmp eq i64 %masked, 0 + ret i1 %ret +} diff --git a/llvm/test/Transforms/InstCombine/masked_intrinsics.ll b/llvm/test/Transforms/InstCombine/masked_intrinsics.ll index 2704905f7a358d..c87c1199f727ea 100644 --- a/llvm/test/Transforms/InstCombine/masked_intrinsics.ll +++ 
b/llvm/test/Transforms/InstCombine/masked_intrinsics.ll @@ -292,7 +292,11 @@ entry: define void @scatter_nxv4i16_uniform_vals_uniform_ptrs_all_active_mask(ptr %dst, i16 %val) { ; CHECK-LABEL: @scatter_nxv4i16_uniform_vals_uniform_ptrs_all_active_mask( ; CHECK-NEXT: entry: -; CHECK-NEXT: store i16 [[VAL:%.*]], ptr [[DST:%.*]], align 2 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[DST:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_VALUE:%.*]] = insertelement poison, i16 [[VAL:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATVALUE:%.*]] = shufflevector [[BROADCAST_VALUE]], poison, zeroinitializer +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0( [[BROADCAST_SPLATVALUE]], [[BROADCAST_SPLAT]], i32 2, shufflevector ( insertelement ( zeroinitializer, i1 true, i32 0), zeroinitializer, zeroinitializer)) ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/InstCombine/pr80597.ll b/llvm/test/Transforms/InstCombine/pr80597.ll new file mode 100644 index 00000000000000..5feae4a06c45c0 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/pr80597.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +define i64 @pr80597(i1 %cond) { +; CHECK-LABEL: define i64 @pr80597( +; CHECK-SAME: i1 [[COND:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADD:%.*]] = select i1 [[COND]], i64 0, i64 -12884901888 +; CHECK-NEXT: [[SEXT1:%.*]] = add nsw i64 [[ADD]], 8836839514384105472 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[SEXT1]], -34359738368 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[SEXT2:%.*]] = ashr exact i64 [[ADD]], 1 +; CHECK-NEXT: [[ASHR:%.*]] = or i64 [[SEXT2]], 4418419761487020032 +; CHECK-NEXT: ret i64 [[ASHR]] +; CHECK: if.then: +; CHECK-NEXT: ret i64 
0 +; +entry: + %add = select i1 %cond, i64 0, i64 4294967293 + %add8 = shl i64 %add, 32 + %sext1 = add i64 %add8, 8836839514384105472 + %cmp = icmp ult i64 %sext1, -34359738368 + br i1 %cmp, label %if.then, label %if.else + +if.else: + %sext2 = or i64 %add8, 8836839522974040064 + %ashr = ashr i64 %sext2, 1 + ret i64 %ashr + +if.then: + ret i64 0 +} diff --git a/llvm/test/Transforms/InstCombine/pr80941.ll b/llvm/test/Transforms/InstCombine/pr80941.ll new file mode 100644 index 00000000000000..95242b1d1407bf --- /dev/null +++ b/llvm/test/Transforms/InstCombine/pr80941.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +define float @pr80941(float %arg) { +; CHECK-LABEL: define float @pr80941( +; CHECK-SAME: float [[ARG:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND:%.*]] = tail call i1 @llvm.is.fpclass.f32(float [[ARG]], i32 144) +; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_EXIT:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[FPEXT:%.*]] = fpext float [[ARG]] to double +; CHECK-NEXT: [[SIGN:%.*]] = call double @llvm.copysign.f64(double 0.000000e+00, double [[FPEXT]]) +; CHECK-NEXT: [[FPTRUNC:%.*]] = fptrunc double [[SIGN]] to float +; CHECK-NEXT: br label [[IF_EXIT]] +; CHECK: if.exit: +; CHECK-NEXT: [[RET:%.*]] = phi float [ [[FPTRUNC]], [[IF_THEN]] ], [ [[ARG]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret float [[RET]] +; +entry: + %cond = tail call i1 @llvm.is.fpclass.f32(float %arg, i32 144) + br i1 %cond, label %if.then, label %if.exit + +if.then: + %fpext = fpext float %arg to double + %sign = call double @llvm.copysign.f64(double 0.000000e+00, double %fpext) + %fptrunc = fptrunc double %sign to float + br label %if.exit + +if.exit: + %ret = phi float [ %fptrunc, %if.then ], [ %arg, %entry ] + ret float %ret +} diff --git a/llvm/test/Transforms/InstCombine/pr83931.ll b/llvm/test/Transforms/InstCombine/pr83931.ll new file mode 
100644 index 00000000000000..d36ac8d91abd30 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/pr83931.ll @@ -0,0 +1,15 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +define @dont_crash( %x) { +; CHECK-LABEL: define @dont_crash( +; CHECK-SAME: [[X:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[RET:%.*]] = icmp sgt [[X]], shufflevector ( insertelement ( poison, i64 -309383, i64 0), poison, zeroinitializer) +; CHECK-NEXT: ret [[RET]] +; +entry: + %div = sdiv %x, splat (i64 309383) + %ret = icmp sge %div, zeroinitializer + ret %ret +} diff --git a/llvm/test/Transforms/InstCombine/pr83947.ll b/llvm/test/Transforms/InstCombine/pr83947.ll new file mode 100644 index 00000000000000..c1d601ff637183 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/pr83947.ll @@ -0,0 +1,67 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +@c = global i32 0, align 4 +@b = global i32 0, align 4 + +define void @masked_scatter1() { +; CHECK-LABEL: define void @masked_scatter1() { +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( zeroinitializer, shufflevector ( insertelement ( poison, ptr @c, i64 0), poison, zeroinitializer), i32 4, shufflevector ( insertelement ( poison, i1 icmp eq (ptr getelementptr inbounds (i32, ptr @b, i64 1), ptr @c), i64 0), poison, zeroinitializer)) +; CHECK-NEXT: ret void +; + call void @llvm.masked.scatter.nxv4i32.nxv4p0( zeroinitializer, splat (ptr @c), i32 4, splat (i1 icmp eq (ptr getelementptr (i32, ptr @b, i64 1), ptr @c))) + ret void +} + +define void @masked_scatter2() { +; CHECK-LABEL: define void @masked_scatter2() { +; CHECK-NEXT: store i32 0, ptr @c, align 4 +; CHECK-NEXT: ret void +; + call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> splat (ptr @c), i32 4, <2 x i1> splat (i1 true)) + ret void +} 
+ +define void @masked_scatter3() { +; CHECK-LABEL: define void @masked_scatter3() { +; CHECK-NEXT: store i32 0, ptr @c, align 4 +; CHECK-NEXT: ret void +; + call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> splat (ptr @c), i32 4, <2 x i1> undef) + ret void +} + +define void @masked_scatter4() { +; CHECK-LABEL: define void @masked_scatter4() { +; CHECK-NEXT: ret void +; + call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> splat (ptr @c), i32 4, <2 x i1> splat (i1 false)) + ret void +} + +define void @masked_scatter5() { +; CHECK-LABEL: define void @masked_scatter5() { +; CHECK-NEXT: store i32 0, ptr @c, align 4 +; CHECK-NEXT: ret void +; + call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> splat (ptr @c), i32 4, <2 x i1> ) + ret void +} + +define void @masked_scatter6() { +; CHECK-LABEL: define void @masked_scatter6() { +; CHECK-NEXT: store i32 0, ptr @c, align 4 +; CHECK-NEXT: ret void +; + call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> splat (ptr @c), i32 4, <2 x i1> ) + ret void +} + +define void @masked_scatter7() { +; CHECK-LABEL: define void @masked_scatter7() { +; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> , i32 4, <2 x i1> ) +; CHECK-NEXT: ret void +; + call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> splat (ptr @c), i32 4, <2 x i1> splat (i1 icmp eq (ptr getelementptr (i32, ptr @b, i64 1), ptr @c))) + ret void +} diff --git a/llvm/test/Transforms/InstCombine/select-divrem.ll b/llvm/test/Transforms/InstCombine/select-divrem.ll index f007c53359ca5a..e0c460c37451db 100644 --- a/llvm/test/Transforms/InstCombine/select-divrem.ll +++ b/llvm/test/Transforms/InstCombine/select-divrem.ll @@ -343,3 +343,20 @@ define i32 @rem_euclid_pow2_false_arm_folded(i32 %n) { %res = select i1 %nonneg, i32 %rem, i32 1 ret i32 %res } + +define i8 @pr89516(i8 %n, i8 %x) { +; CHECK-LABEL: 
@pr89516( +; CHECK-NEXT: [[COND:%.*]] = icmp slt i8 [[X:%.*]], 0 +; CHECK-NEXT: [[POW2:%.*]] = shl nuw i8 1, [[N:%.*]] +; CHECK-NEXT: [[SREM:%.*]] = srem i8 1, [[POW2]] +; CHECK-NEXT: [[ADD:%.*]] = select i1 [[COND]], i8 [[POW2]], i8 0 +; CHECK-NEXT: [[RES:%.*]] = add nuw i8 [[SREM]], [[ADD]] +; CHECK-NEXT: ret i8 [[RES]] +; + %cond = icmp slt i8 %x, 0 + %pow2 = shl nuw i8 1, %n + %srem = srem i8 1, %pow2 + %add = add nuw i8 %srem, %pow2 + %res = select i1 %cond, i8 %add, i8 %srem + ret i8 %res +} diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index c5f1b77c6d7404..888e7d28f78afb 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -2849,12 +2849,14 @@ define i8 @select_replacement_sub(i8 %x, i8 %y, i8 %z) { ret i8 %sel } +; FIXME: This is safe to fold. define i8 @select_replacement_shift_noundef(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @select_replacement_shift_noundef( ; CHECK-NEXT: [[SHR:%.*]] = lshr exact i8 [[X:%.*]], 1 ; CHECK-NEXT: call void @use_i8(i8 noundef [[SHR]]) ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[SHR]], [[Y:%.*]] -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i8 [[X]], i8 [[Z:%.*]] +; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[Y]], 1 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i8 [[SHL]], i8 [[Z:%.*]] ; CHECK-NEXT: ret i8 [[SEL]] ; %shr = lshr exact i8 %x, 1 @@ -2904,6 +2906,40 @@ define i32 @select_replacement_loop2(i32 %arg, i32 %arg2) { ret i32 %sel } +define i8 @select_replacement_loop3(i32 noundef %x) { +; CHECK-LABEL: @select_replacement_loop3( +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[X:%.*]] to i8 +; CHECK-NEXT: [[REV:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[TRUNC]]) +; CHECK-NEXT: [[EXT:%.*]] = zext i8 [[REV]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[EXT]], [[X]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i8 [[TRUNC]], i8 0 +; CHECK-NEXT: ret i8 [[SEL]] +; + %trunc = trunc i32 %x to i8 + %rev = call i8 
@llvm.bitreverse.i8(i8 %trunc) + %ext = zext i8 %rev to i32 + %cmp = icmp eq i32 %ext, %x + %sel = select i1 %cmp, i8 %trunc, i8 0 + ret i8 %sel +} + +define i16 @select_replacement_loop4(i16 noundef %p_12) { +; CHECK-LABEL: @select_replacement_loop4( +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i16 [[P_12:%.*]], 2 +; CHECK-NEXT: [[AND1:%.*]] = and i16 [[P_12]], 1 +; CHECK-NEXT: [[AND2:%.*]] = select i1 [[CMP1]], i16 [[AND1]], i16 0 +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i16 [[AND2]], [[P_12]] +; CHECK-NEXT: [[AND3:%.*]] = select i1 [[CMP2]], i16 [[AND1]], i16 0 +; CHECK-NEXT: ret i16 [[AND3]] +; + %cmp1 = icmp ult i16 %p_12, 2 + %and1 = and i16 %p_12, 1 + %and2 = select i1 %cmp1, i16 %and1, i16 0 + %cmp2 = icmp eq i16 %and2, %p_12 + %and3 = select i1 %cmp2, i16 %and1, i16 0 + ret i16 %and3 +} + define ptr @select_replacement_gep_inbounds(ptr %base, i64 %offset) { ; CHECK-LABEL: @select_replacement_gep_inbounds( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[BASE:%.*]], i64 [[OFFSET:%.*]] @@ -3423,7 +3459,7 @@ define @scalable_sign_bits( %x) { define @scalable_non_zero( %x) { ; CHECK-LABEL: @scalable_non_zero( ; CHECK-NEXT: [[A:%.*]] = or [[X:%.*]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) -; CHECK-NEXT: [[CMP:%.*]] = icmp ule [[A]], shufflevector ( insertelement ( poison, i32 56, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[CMP:%.*]] = icmp ult [[A]], shufflevector ( insertelement ( poison, i32 57, i64 0), poison, zeroinitializer) ; CHECK-NEXT: ret [[CMP]] ; %a = or %x, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) diff --git a/llvm/test/Transforms/InstCombine/vscale_cmp.ll b/llvm/test/Transforms/InstCombine/vscale_cmp.ll index a7f8368c5d62c8..b2bfc93da089fc 100644 --- a/llvm/test/Transforms/InstCombine/vscale_cmp.ll +++ b/llvm/test/Transforms/InstCombine/vscale_cmp.ll @@ -3,7 +3,7 @@ define @sge( %x) { ; CHECK-LABEL: @sge( -; CHECK-NEXT: [[CMP:%.*]] = icmp sge [[X:%.*]], 
zeroinitializer +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt [[X:%.*]], shufflevector ( insertelement ( poison, i8 -1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: ret [[CMP]] ; %cmp = icmp sge %x, zeroinitializer diff --git a/llvm/test/Transforms/InstSimplify/pr87042.ll b/llvm/test/Transforms/InstSimplify/pr87042.ll new file mode 100644 index 00000000000000..800d27c9e65043 --- /dev/null +++ b/llvm/test/Transforms/InstSimplify/pr87042.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=instsimplify -S | FileCheck %s + +; %or2 cannot be folded into %or1 because %or1 has disjoint. +; TODO: Can we move the logic into InstCombine and drop the disjoint flag? +define i64 @test(i1 %cond, i64 %x) { +; CHECK-LABEL: define i64 @test( +; CHECK-SAME: i1 [[COND:%.*]], i64 [[X:%.*]]) { +; CHECK-NEXT: [[OR1:%.*]] = or disjoint i64 [[X]], 7 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[COND]], i64 [[OR1]], i64 [[X]] +; CHECK-NEXT: [[OR2:%.*]] = or i64 [[SEL1]], 7 +; CHECK-NEXT: ret i64 [[OR2]] +; + %or1 = or disjoint i64 %x, 7 + %sel1 = select i1 %cond, i64 %or1, i64 %x + %or2 = or i64 %sel1, 7 + ret i64 %or2 +} + +define i64 @pr87042(i64 %x) { +; CHECK-LABEL: define i64 @pr87042( +; CHECK-SAME: i64 [[X:%.*]]) { +; CHECK-NEXT: [[AND1:%.*]] = and i64 [[X]], 65535 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 [[AND1]], 0 +; CHECK-NEXT: [[OR1:%.*]] = or disjoint i64 [[X]], 7 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP1]], i64 [[OR1]], i64 [[X]] +; CHECK-NEXT: [[AND2:%.*]] = and i64 [[SEL1]], 16776960 +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 [[AND2]], 0 +; CHECK-NEXT: [[OR2:%.*]] = or i64 [[SEL1]], 7 +; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[CMP2]], i64 [[OR2]], i64 [[SEL1]] +; CHECK-NEXT: ret i64 [[SEL2]] +; + %and1 = and i64 %x, 65535 + %cmp1 = icmp eq i64 %and1, 0 + %or1 = or disjoint i64 %x, 7 + %sel1 = select i1 %cmp1, i64 %or1, i64 %x + %and2 = and i64 %sel1, 16776960 + %cmp2 = icmp eq i64 %and2, 0 + 
%or2 = or i64 %sel1, 7 + %sel2 = select i1 %cmp2, i64 %or2, i64 %sel1 + ret i64 %sel2 +} diff --git a/llvm/test/Transforms/JumpThreading/pr79175.ll b/llvm/test/Transforms/JumpThreading/pr79175.ll new file mode 100644 index 00000000000000..2c7ee0770cdc73 --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/pr79175.ll @@ -0,0 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=jump-threading < %s | FileCheck %s + +@f = external global i32 + +; Make sure the value of @f is reloaded prior to the final comparison. +define i32 @test(i64 %idx, i32 %val) { +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: i64 [[IDX:%.*]], i32 [[VAL:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[IDX]], 1 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[RETURN:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[F:%.*]] = load i32, ptr @f, align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[F]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[COND_END_THREAD:%.*]], label [[COND_END:%.*]] +; CHECK: cond.end: +; CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[VAL]], 0 +; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[CMP_I]] +; CHECK-NEXT: br i1 [[COND_FR]], label [[COND_END_THREAD]], label [[TMP0:%.*]] +; CHECK: cond.end.thread: +; CHECK-NEXT: br label [[TMP0]] +; CHECK: 0: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[COND_END_THREAD]] ], [ [[VAL]], [[COND_END]] ] +; CHECK-NEXT: [[F_IDX:%.*]] = getelementptr inbounds i32, ptr @f, i64 [[IDX]] +; CHECK-NEXT: store i32 [[TMP1]], ptr [[F_IDX]], align 4 +; CHECK-NEXT: [[F_RELOAD:%.*]] = load i32, ptr @f, align 4 +; CHECK-NEXT: [[CMP3:%.*]] = icmp slt i32 [[F_RELOAD]], 1 +; CHECK-NEXT: br i1 [[CMP3]], label [[RETURN2:%.*]], label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: ret i32 0 +; CHECK: return2: +; CHECK-NEXT: ret i32 1 +; +entry: + %cmp = icmp slt i64 %idx, 1 + br i1 %cmp, label %for.body, label %return + +for.body: + %f = load i32, ptr @f, align 4 + 
%cmp1 = icmp eq i32 %f, 0 + br i1 %cmp1, label %cond.end, label %cond.false + +cond.false: + br label %cond.end + +cond.end: + %phi = phi i32 [ %val, %cond.false ], [ 1, %for.body ] + %cmp.i = icmp sgt i32 %phi, 0 + %sel = select i1 %cmp.i, i32 0, i32 %phi + %f.idx = getelementptr inbounds i32, ptr @f, i64 %idx + store i32 %sel, ptr %f.idx, align 4 + %f.reload = load i32, ptr @f, align 4 + %cmp3 = icmp slt i32 %f.reload, 1 + br i1 %cmp3, label %return2, label %return + +return: + ret i32 0 + +return2: + ret i32 1 +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll new file mode 100644 index 00000000000000..8a796bb3065b19 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll @@ -0,0 +1,217 @@ +; REQUIRES: asserts +; RUN: opt -p loop-vectorize -debug-only=loop-vectorize -S -disable-output < %s 2>&1 | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +define void @no_outer_loop(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %off, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'no_outer_loop' +; CHECK: Calculating cost of runtime checks: +; CHECK-NOT: We expect runtime memory checks to be hoisted out of the outer loop. 
+; CHECK: Total cost of runtime checks: 4 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +entry: + br label %inner.loop + +inner.loop: + %inner.iv = phi i64 [ 0, %entry ], [ %inner.iv.next, %inner.loop ] + %add.us = add nuw nsw i64 %inner.iv, %off + %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us + %0 = load i8, ptr %arrayidx.us, align 1 + %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us + %1 = load i8, ptr %arrayidx7.us, align 1 + %add9.us = add i8 %1, %0 + store i8 %add9.us, ptr %arrayidx7.us, align 1 + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %exitcond.not = icmp eq i64 %inner.iv.next, %n + br i1 %exitcond.not, label %inner.exit, label %inner.loop + +inner.exit: + ret void +} + +define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc' +; CHECK: Calculating cost of runtime checks: +; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. 
Cost reduced from 6 to 3 +; CHECK: Total cost of runtime checks: 3 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] + %mul.us = mul nsw i64 %outer.iv, %n + br label %inner.loop + +inner.loop: + %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] + %add.us = add nuw nsw i64 %inner.iv, %mul.us + %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us + %0 = load i8, ptr %arrayidx.us, align 1 + %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us + %1 = load i8, ptr %arrayidx7.us, align 1 + %add9.us = add i8 %1, %0 + store i8 %add9.us, ptr %arrayidx7.us, align 1 + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %exitcond.not = icmp eq i64 %inner.iv.next, %n + br i1 %exitcond.not, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %exitcond27.not = icmp eq i64 %outer.iv.next, %m + br i1 %exitcond27.not, label %outer.exit, label %outer.loop + +outer.exit: + ret void +} + + +define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3' +; CHECK: Calculating cost of runtime checks: +; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. 
Cost reduced from 6 to 2 +; CHECK: Total cost of runtime checks: 2 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] + %mul.us = mul nsw i64 %outer.iv, %n + br label %inner.loop + +inner.loop: + %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] + %add.us = add nuw nsw i64 %inner.iv, %mul.us + %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us + %0 = load i8, ptr %arrayidx.us, align 1 + %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us + %1 = load i8, ptr %arrayidx7.us, align 1 + %add9.us = add i8 %1, %0 + store i8 %add9.us, ptr %arrayidx7.us, align 1 + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %exitcond.not = icmp eq i64 %inner.iv.next, %n + br i1 %exitcond.not, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %exitcond26.not = icmp eq i64 %outer.iv.next, 3 + br i1 %exitcond26.not, label %outer.exit, label %outer.loop + +outer.exit: + ret void +} + + +define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64' +; CHECK: Calculating cost of runtime checks: +; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. 
Cost reduced from 6 to 1 +; CHECK: Total cost of runtime checks: 1 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] + %mul.us = mul nsw i64 %outer.iv, %n + br label %inner.loop + +inner.loop: + %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] + %add.us = add nuw nsw i64 %inner.iv, %mul.us + %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us + %0 = load i8, ptr %arrayidx.us, align 1 + %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us + %1 = load i8, ptr %arrayidx7.us, align 1 + %add9.us = add i8 %1, %0 + store i8 %add9.us, ptr %arrayidx7.us, align 1 + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %exitcond.not = icmp eq i64 %inner.iv.next, %n + br i1 %exitcond.not, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %exitcond26.not = icmp eq i64 %outer.iv.next, 64 + br i1 %exitcond26.not, label %outer.exit, label %outer.loop + +outer.exit: + ret void +} + + +define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3' +; CHECK: Calculating cost of runtime checks: +; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. 
Cost reduced from 6 to 2 +; CHECK: Total cost of runtime checks: 2 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] + %mul.us = mul nsw i64 %outer.iv, %n + br label %inner.loop + +inner.loop: + %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] + %add.us = add nuw nsw i64 %inner.iv, %mul.us + %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us + %0 = load i8, ptr %arrayidx.us, align 1 + %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us + %1 = load i8, ptr %arrayidx7.us, align 1 + %add9.us = add i8 %1, %0 + store i8 %add9.us, ptr %arrayidx7.us, align 1 + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %exitcond.not = icmp eq i64 %inner.iv.next, %n + br i1 %exitcond.not, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %exitcond26.not = icmp eq i64 %outer.iv.next, %m + br i1 %exitcond26.not, label %outer.exit, label %outer.loop, !prof !0 + +outer.exit: + ret void +} + + +define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks' +; CHECK: Calculating cost of runtime checks: +; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. 
Cost reduced from 6 to 2 +; CHECK: Total cost of runtime checks: 2 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:4 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ] + %0 = mul nsw i64 %outer.iv, %n + br label %inner.loop + +inner.loop: + %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ] + %1 = add nuw nsw i64 %iv.inner, %0 + %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %1 + %2 = load i32, ptr %arrayidx.us, align 4 + %arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %1 + %3 = load i32, ptr %arrayidx8.us, align 4 + %add9.us = add nsw i32 %3, %2 + store i32 %add9.us, ptr %arrayidx8.us, align 4 + %iv.inner.next = add nuw nsw i64 %iv.inner, 1 + %inner.exit.cond = icmp eq i64 %iv.inner.next, %n + br i1 %inner.exit.cond, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %outer.exit.cond = icmp eq i64 %outer.iv.next, 3 + br i1 %outer.exit.cond, label %outer.exit, label %outer.loop + +outer.exit: + ret void +} + + +!0 = !{!"branch_weights", i32 10, i32 20} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll index d07f72792e6b99..dd1495626eb984 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call.*(cos|sin|tan|cbrt|erf|exp[^e]|gamma|log|sqrt|copysign|dim|min|mod|hypot|nextafter|pow|fma)" --version 2 -; RUN: opt -mattr=+neon -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=SLEEF-NEON -; RUN: opt -mattr=+sve -vector-library=sleefgnuabi 
-passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefix=SLEEF-SVE -; RUN: opt -mattr=+neon -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=ARMPL-NEON -; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefix=ARMPL-SVE +; RUN: opt -mattr=+neon -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s -check-prefix=SLEEF-NEON +; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s -check-prefix=SLEEF-SVE +; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s -check-prefixes=SLEEF-SVE-NOPRED +; RUN: opt -mattr=+neon -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s -check-prefix=ARMPL-NEON +; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s -check-prefix=ARMPL-SVE +; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s -check-prefixes=ARMPL-SVE-NOPRED + + target triple = "aarch64-unknown-linux-gnu" @@ -23,6 +27,11 @@ define void @acos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias 
[[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_acos( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @acos_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_acos( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @acos(double [[IN:%.*]]) #[[ATTR2:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @acos_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vacosq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -30,6 +39,11 @@ define void @acos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @acos_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svacos_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @acos_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svacos_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @acos(double [[IN:%.*]]) #[[ATTR2:[0-9]+]] ; entry: br label %for.body @@ -58,6 +72,11 @@ define void @acos_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_acosf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @acos_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; 
SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_acosf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @acosf(float [[IN:%.*]]) #[[ATTR3:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @acos_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vacosq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -65,6 +84,11 @@ define void @acos_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @acos_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svacos_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @acos_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svacos_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @acosf(float [[IN:%.*]]) #[[ATTR3:[0-9]+]] ; entry: br label %for.body @@ -96,6 +120,11 @@ define void @acosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_acosh( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @acosh_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_acosh( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @acosh(double [[IN:%.*]]) #[[ATTR4:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @acosh_f64 ; ARMPL-NEON-SAME: (ptr 
noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vacoshq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -103,6 +132,11 @@ define void @acosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @acosh_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svacosh_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @acosh_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svacosh_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @acosh(double [[IN:%.*]]) #[[ATTR4:[0-9]+]] ; entry: br label %for.body @@ -131,6 +165,11 @@ define void @acosh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_acoshf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @acosh_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_acoshf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @acoshf(float [[IN:%.*]]) #[[ATTR5:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @acosh_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vacoshq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -138,6 +177,11 @@ define void @acosh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @acosh_f32 ; 
ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svacosh_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @acosh_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svacosh_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @acoshf(float [[IN:%.*]]) #[[ATTR5:[0-9]+]] ; entry: br label %for.body @@ -169,6 +213,11 @@ define void @asin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_asin( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @asin_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_asin( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @asin(double [[IN:%.*]]) #[[ATTR6:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @asin_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vasinq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -176,6 +225,11 @@ define void @asin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @asin_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svasin_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @asin_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias 
[[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svasin_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @asin(double [[IN:%.*]]) #[[ATTR6:[0-9]+]] ; entry: br label %for.body @@ -204,6 +258,11 @@ define void @asin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_asinf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @asin_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_asinf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @asinf(float [[IN:%.*]]) #[[ATTR7:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @asin_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vasinq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -211,6 +270,11 @@ define void @asin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @asin_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svasin_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @asin_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svasin_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @asinf(float [[IN:%.*]]) #[[ATTR7:[0-9]+]] ; entry: br label %for.body @@ 
-242,6 +306,11 @@ define void @asinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_asinh( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @asinh_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_asinh( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @asinh(double [[IN:%.*]]) #[[ATTR8:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @asinh_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vasinhq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -249,6 +318,11 @@ define void @asinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @asinh_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svasinh_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @asinh_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svasinh_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @asinh(double [[IN:%.*]]) #[[ATTR8:[0-9]+]] ; entry: br label %for.body @@ -277,6 +351,11 @@ define void @asinh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_asinhf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: 
define void @asinh_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_asinhf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @asinhf(float [[IN:%.*]]) #[[ATTR9:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @asinh_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vasinhq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -284,6 +363,11 @@ define void @asinh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @asinh_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svasinh_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @asinh_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svasinh_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @asinhf(float [[IN:%.*]]) #[[ATTR9:[0-9]+]] ; entry: br label %for.body @@ -315,6 +399,11 @@ define void @atan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_atan( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @atan_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_atan( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = 
tail call double @atan(double [[IN:%.*]]) #[[ATTR10:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @atan_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vatanq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -322,6 +411,11 @@ define void @atan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @atan_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svatan_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @atan_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svatan_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @atan(double [[IN:%.*]]) #[[ATTR10:[0-9]+]] ; entry: br label %for.body @@ -350,6 +444,11 @@ define void @atan_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_atanf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @atan_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_atanf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @atanf(float [[IN:%.*]]) #[[ATTR11:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @atan_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vatanq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -357,6 +456,11 @@ define 
void @atan_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @atan_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svatan_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @atan_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svatan_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @atanf(float [[IN:%.*]]) #[[ATTR11:[0-9]+]] ; entry: br label %for.body @@ -388,6 +492,11 @@ define void @atan2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_atan2( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @atan2_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_atan2( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @atan2(double [[IN:%.*]], double [[IN]]) #[[ATTR12:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @atan2_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vatan2q_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -395,6 +504,11 @@ define void @atan2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @atan2_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call 
@armpl_svatan2_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @atan2_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svatan2_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @atan2(double [[IN:%.*]], double [[IN]]) #[[ATTR12:[0-9]+]] ; entry: br label %for.body @@ -423,6 +537,11 @@ define void @atan2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_atan2f( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @atan2_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_atan2f( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @atan2f(float [[IN:%.*]], float [[IN]]) #[[ATTR13:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @atan2_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vatan2q_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -430,6 +549,11 @@ define void @atan2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @atan2_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svatan2_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @atan2_f32 +; ARMPL-SVE-NOPRED-SAME: 
(ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svatan2_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @atan2f(float [[IN:%.*]], float [[IN]]) #[[ATTR13:[0-9]+]] ; entry: br label %for.body @@ -461,6 +585,11 @@ define void @atanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_atanh( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @atanh_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_atanh( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @atanh(double [[IN:%.*]]) #[[ATTR14:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @atanh_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vatanhq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -468,6 +597,11 @@ define void @atanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @atanh_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svatanh_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @atanh_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svatanh_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail 
call double @atanh(double [[IN:%.*]]) #[[ATTR14:[0-9]+]] ; entry: br label %for.body @@ -496,6 +630,11 @@ define void @atanh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_atanhf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @atanh_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_atanhf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @atanhf(float [[IN:%.*]]) #[[ATTR15:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @atanh_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vatanhq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -503,6 +642,11 @@ define void @atanh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @atanh_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svatanh_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @atanh_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svatanh_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @atanhf(float [[IN:%.*]]) #[[ATTR15:[0-9]+]] ; entry: br label %for.body @@ -534,6 +678,11 @@ define void @cbrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call 
@_ZGVsMxv_cbrt( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @cbrt_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_cbrt( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @cbrt(double [[IN:%.*]]) #[[ATTR16:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @cbrt_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcbrtq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -541,6 +690,11 @@ define void @cbrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @cbrt_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcbrt_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @cbrt_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svcbrt_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @cbrt(double [[IN:%.*]]) #[[ATTR16:[0-9]+]] ; entry: br label %for.body @@ -569,6 +723,11 @@ define void @cbrt_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_cbrtf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @cbrt_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_cbrtf( [[WIDE_LOAD:%.*]], shufflevector ( 
insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @cbrtf(float [[IN:%.*]]) #[[ATTR17:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @cbrt_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcbrtq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -576,6 +735,11 @@ define void @cbrt_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @cbrt_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcbrt_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @cbrt_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svcbrt_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @cbrtf(float [[IN:%.*]]) #[[ATTR17:[0-9]+]] ; entry: br label %for.body @@ -607,6 +771,11 @@ define void @copysign_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_copysign( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @copysign_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_copysign( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @copysign(double [[IN:%.*]], double [[IN]]) #[[ATTR18:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @copysign_f64 ; ARMPL-NEON-SAME: (ptr noalias 
[[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcopysignq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -614,6 +783,11 @@ define void @copysign_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @copysign_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcopysign_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @copysign_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svcopysign_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @copysign(double [[IN:%.*]], double [[IN]]) #[[ATTR18:[0-9]+]] ; entry: br label %for.body @@ -642,6 +816,11 @@ define void @copysign_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_copysignf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @copysign_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_copysignf( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @copysignf(float [[IN:%.*]], float [[IN]]) #[[ATTR19:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @copysign_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcopysignq_f32(<4 x 
float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -649,6 +828,11 @@ define void @copysign_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @copysign_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcopysign_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @copysign_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svcopysign_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @copysignf(float [[IN:%.*]], float [[IN]]) #[[ATTR19:[0-9]+]] ; entry: br label %for.body @@ -680,6 +864,11 @@ define void @cos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_cos( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @cos_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_cos( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @cos(double [[IN:%.*]]) #[[ATTR20:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @cos_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcosq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -687,6 +876,11 @@ define void @cos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @cos_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias 
[[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcos_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @cos_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svcos_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @cos(double [[IN:%.*]]) #[[ATTR20:[0-9]+]] ; entry: br label %for.body @@ -715,6 +909,11 @@ define void @cos_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_cosf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @cos_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_cosf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @cosf(float [[IN:%.*]]) #[[ATTR21:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @cos_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcosq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -722,6 +921,11 @@ define void @cos_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @cos_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcos_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @cos_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call 
@armpl_svcos_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @cosf(float [[IN:%.*]]) #[[ATTR21:[0-9]+]] ; entry: br label %for.body @@ -753,6 +957,11 @@ define void @cosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_cosh( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @cosh_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_cosh( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @cosh(double [[IN:%.*]]) #[[ATTR22:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @cosh_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcoshq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -760,6 +969,11 @@ define void @cosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @cosh_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcosh_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @cosh_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svcosh_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @cosh(double [[IN:%.*]]) #[[ATTR22:[0-9]+]] ; entry: br label %for.body @@ -788,6 +1002,11 @@ define void @cosh_f32(ptr noalias %in.ptr, ptr noalias 
%out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_coshf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @cosh_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_coshf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @coshf(float [[IN:%.*]]) #[[ATTR23:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @cosh_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcoshq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -795,6 +1014,11 @@ define void @cosh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @cosh_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcosh_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @cosh_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svcosh_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @coshf(float [[IN:%.*]]) #[[ATTR23:[0-9]+]] ; entry: br label %for.body @@ -826,6 +1050,11 @@ define void @cospi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_cospi( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @cospi_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr 
noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_cospi( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @cospi(double [[IN:%.*]]) #[[ATTR24:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @cospi_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcospiq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -833,6 +1062,11 @@ define void @cospi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @cospi_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcospi_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @cospi_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svcospi_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @cospi(double [[IN:%.*]]) #[[ATTR24:[0-9]+]] ; entry: br label %for.body @@ -861,6 +1095,11 @@ define void @cospi_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_cospif( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @cospi_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_cospif( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @cospif(float [[IN:%.*]]) #[[ATTR25:[0-9]+]] +; ; 
ARMPL-NEON-LABEL: define void @cospi_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcospiq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -868,6 +1107,11 @@ define void @cospi_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @cospi_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcospi_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @cospi_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svcospi_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @cospif(float [[IN:%.*]]) #[[ATTR25:[0-9]+]] ; entry: br label %for.body @@ -899,6 +1143,11 @@ define void @erf_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_erf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @erf_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_erf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @erf(double [[IN:%.*]]) #[[ATTR26:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @erf_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_verfq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -906,6 +1155,11 @@ define void @erf_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; 
ARMPL-SVE-LABEL: define void @erf_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_sverf_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @erf_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_sverf_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @erf(double [[IN:%.*]]) #[[ATTR26:[0-9]+]] ; entry: br label %for.body @@ -934,6 +1188,11 @@ define void @erf_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_erff( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @erf_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_erff( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @erff(float [[IN:%.*]]) #[[ATTR27:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @erf_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_verfq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -941,6 +1200,11 @@ define void @erf_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @erf_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_sverf_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @erf_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias 
[[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_sverf_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @erff(float [[IN:%.*]]) #[[ATTR27:[0-9]+]] ; entry: br label %for.body @@ -972,6 +1236,11 @@ define void @erfc_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_erfc( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @erfc_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_erfc( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @erfc(double [[IN:%.*]]) #[[ATTR28:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @erfc_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_verfcq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -979,6 +1248,11 @@ define void @erfc_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @erfc_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_sverfc_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @erfc_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_sverfc_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @erfc(double [[IN:%.*]]) #[[ATTR28:[0-9]+]] ; 
entry: br label %for.body @@ -1007,6 +1281,11 @@ define void @erfc_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_erfcf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @erfc_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_erfcf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @erfcf(float [[IN:%.*]]) #[[ATTR29:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @erfc_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_verfcq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1014,6 +1293,11 @@ define void @erfc_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @erfc_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_sverfc_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @erfc_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_sverfc_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @erfcf(float [[IN:%.*]]) #[[ATTR29:[0-9]+]] ; entry: br label %for.body @@ -1045,6 +1329,11 @@ define void @exp_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_exp( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; 
SLEEF-SVE-NOPRED-LABEL: define void @exp_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_exp( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @exp(double [[IN:%.*]]) #[[ATTR30:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @exp_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexpq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1052,6 +1341,11 @@ define void @exp_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @exp_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svexp_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @exp_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svexp_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @exp(double [[IN:%.*]]) #[[ATTR30:[0-9]+]] ; entry: br label %for.body @@ -1080,6 +1374,11 @@ define void @exp_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_expf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @exp_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_expf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: 
[[CALL:%.*]] = tail call float @expf(float [[IN:%.*]]) #[[ATTR31:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @exp_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexpq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1087,6 +1386,11 @@ define void @exp_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @exp_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svexp_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @exp_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svexp_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @expf(float [[IN:%.*]]) #[[ATTR31:[0-9]+]] ; entry: br label %for.body @@ -1118,6 +1422,11 @@ define void @exp10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_exp10( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @exp10_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_exp10( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @exp10(double [[IN:%.*]]) #[[ATTR32:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @exp10_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexp10q_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1125,6 
+1434,11 @@ define void @exp10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @exp10_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svexp10_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @exp10_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svexp10_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @exp10(double [[IN:%.*]]) #[[ATTR32:[0-9]+]] ; entry: br label %for.body @@ -1153,6 +1467,11 @@ define void @exp10_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_exp10f( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @exp10_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_exp10f( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @exp10f(float [[IN:%.*]]) #[[ATTR33:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @exp10_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexp10q_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1160,6 +1479,11 @@ define void @exp10_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @exp10_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svexp10_f32_x( [[WIDE_MASKED_LOAD:%.*]], 
[[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @exp10_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svexp10_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @exp10f(float [[IN:%.*]]) #[[ATTR33:[0-9]+]] ; entry: br label %for.body @@ -1191,6 +1515,11 @@ define void @exp2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_exp2( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @exp2_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_exp2( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @exp2(double [[IN:%.*]]) #[[ATTR34:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @exp2_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexp2q_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1198,6 +1527,11 @@ define void @exp2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @exp2_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svexp2_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @exp2_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svexp2_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), 
poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @exp2(double [[IN:%.*]]) #[[ATTR34:[0-9]+]] ; entry: br label %for.body @@ -1226,6 +1560,11 @@ define void @exp2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_exp2f( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @exp2_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_exp2f( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @exp2f(float [[IN:%.*]]) #[[ATTR35:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @exp2_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexp2q_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1233,6 +1572,11 @@ define void @exp2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @exp2_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svexp2_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @exp2_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svexp2_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @exp2f(float [[IN:%.*]]) #[[ATTR35:[0-9]+]] ; entry: br label %for.body @@ -1264,6 +1608,11 @@ define void @expm1_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) 
#[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_expm1( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @expm1_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_expm1( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @expm1(double [[IN:%.*]]) #[[ATTR36:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @expm1_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexpm1q_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1271,6 +1620,11 @@ define void @expm1_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @expm1_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svexpm1_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @expm1_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svexpm1_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @expm1(double [[IN:%.*]]) #[[ATTR36:[0-9]+]] ; entry: br label %for.body @@ -1299,6 +1653,11 @@ define void @expm1_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_expm1f( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @expm1_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = 
call @_ZGVsMxv_expm1f( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @expm1f(float [[IN:%.*]]) #[[ATTR37:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @expm1_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexpm1q_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1306,6 +1665,11 @@ define void @expm1_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @expm1_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svexpm1_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @expm1_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svexpm1_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @expm1f(float [[IN:%.*]]) #[[ATTR37:[0-9]+]] ; entry: br label %for.body @@ -1337,6 +1701,11 @@ define void @fdim_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_fdim( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @fdim_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_fdim( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @fdim(double [[IN:%.*]], double [[IN]]) #[[ATTR38:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void 
@fdim_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vfdimq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -1344,6 +1713,11 @@ define void @fdim_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @fdim_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svfdim_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @fdim_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svfdim_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @fdim(double [[IN:%.*]], double [[IN]]) #[[ATTR38:[0-9]+]] ; entry: br label %for.body @@ -1372,6 +1746,11 @@ define void @fdim_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_fdimf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @fdim_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_fdimf( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @fdimf(float [[IN:%.*]], float [[IN]]) #[[ATTR39:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @fdim_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vfdimq_f32(<4 x float> 
[[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -1379,6 +1758,11 @@ define void @fdim_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @fdim_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svfdim_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @fdim_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svfdim_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @fdimf(float [[IN:%.*]], float [[IN]]) #[[ATTR39:[0-9]+]] ; entry: br label %for.body @@ -1410,6 +1794,11 @@ define void @fma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvvv_fma( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @fma_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvvv_fma( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @fma(double [[IN:%.*]], double [[IN]], double [[IN]]) #[[ATTR40:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @fma_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vfmaq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]]) @@ -1417,6 +1806,11 @@ define void @fma_f64(ptr noalias %in.ptr, 
ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @fma_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svfma_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @fma_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svfma_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @fma(double [[IN:%.*]], double [[IN]], double [[IN]]) #[[ATTR40:[0-9]+]] ; entry: br label %for.body @@ -1445,6 +1839,11 @@ define void @fma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvvv_fmaf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @fma_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvvv_fmaf( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @fmaf(float [[IN:%.*]], float [[IN]], float [[IN]]) #[[ATTR41:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @fma_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vfmaq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]]) @@ -1452,6 +1851,11 @@ define void @fma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @fma_f32 ; 
ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svfma_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @fma_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svfma_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @fmaf(float [[IN:%.*]], float [[IN]], float [[IN]]) #[[ATTR41:[0-9]+]] ; entry: br label %for.body @@ -1483,6 +1887,11 @@ define void @fmax_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_fmax( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @fmax_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_fmax( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @fmax(double [[IN:%.*]], double [[IN]]) #[[ATTR42:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @fmax_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vfmaxq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -1490,6 +1899,11 @@ define void @fmax_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @fmax_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call 
@armpl_svfmax_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @fmax_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svfmax_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @fmax(double [[IN:%.*]], double [[IN]]) #[[ATTR42:[0-9]+]] ; entry: br label %for.body @@ -1518,6 +1932,11 @@ define void @fmax_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_fmaxf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @fmax_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_fmaxf( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @fmaxf(float [[IN:%.*]], float [[IN]]) #[[ATTR43:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @fmax_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vfmaxq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -1525,6 +1944,11 @@ define void @fmax_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @fmax_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svfmax_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @fmax_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias 
[[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svfmax_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @fmaxf(float [[IN:%.*]], float [[IN]]) #[[ATTR43:[0-9]+]] ; entry: br label %for.body @@ -1556,6 +1980,11 @@ define void @fmin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_fmin( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @fmin_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_fmin( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @fmin(double [[IN:%.*]], double [[IN]]) #[[ATTR44:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @fmin_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vfminq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -1563,6 +1992,11 @@ define void @fmin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @fmin_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svfmin_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @fmin_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svfmin_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( 
poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @fmin(double [[IN:%.*]], double [[IN]]) #[[ATTR44:[0-9]+]] ; entry: br label %for.body @@ -1591,6 +2025,11 @@ define void @fmin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_fminf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @fmin_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_fminf( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @fminf(float [[IN:%.*]], float [[IN]]) #[[ATTR45:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @fmin_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vfminq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -1598,6 +2037,11 @@ define void @fmin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @fmin_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svfmin_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @fmin_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svfmin_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @fminf(float [[IN:%.*]], float [[IN]]) #[[ATTR45:[0-9]+]] ; entry: br label 
%for.body @@ -1629,6 +2073,11 @@ define void @fmod_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_fmod( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @fmod_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_fmod( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @fmod(double [[IN:%.*]], double [[IN]]) #[[ATTR46:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @fmod_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vfmodq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -1636,6 +2085,11 @@ define void @fmod_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @fmod_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svfmod_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @fmod_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svfmod_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @fmod(double [[IN:%.*]], double [[IN]]) #[[ATTR46:[0-9]+]] ; entry: br label %for.body @@ -1664,6 +2118,11 @@ define void @fmod_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) 
#[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_fmodf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @fmod_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_fmodf( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @fmodf(float [[IN:%.*]], float [[IN]]) #[[ATTR47:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @fmod_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vfmodq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -1671,6 +2130,11 @@ define void @fmod_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @fmod_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svfmod_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @fmod_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svfmod_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @fmodf(float [[IN:%.*]], float [[IN]]) #[[ATTR47:[0-9]+]] ; entry: br label %for.body @@ -1702,6 +2166,11 @@ define void @hypot_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_hypot( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void 
@hypot_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_hypot( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @hypot(double [[IN:%.*]], double [[IN]]) #[[ATTR48:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @hypot_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vhypotq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -1709,6 +2178,11 @@ define void @hypot_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @hypot_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svhypot_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @hypot_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svhypot_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @hypot(double [[IN:%.*]], double [[IN]]) #[[ATTR48:[0-9]+]] ; entry: br label %for.body @@ -1737,6 +2211,11 @@ define void @hypot_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_hypotf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @hypot_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_hypotf( 
[[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @hypotf(float [[IN:%.*]], float [[IN]]) #[[ATTR49:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @hypot_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vhypotq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -1744,6 +2223,11 @@ define void @hypot_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @hypot_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svhypot_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @hypot_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svhypot_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @hypotf(float [[IN:%.*]], float [[IN]]) #[[ATTR49:[0-9]+]] ; entry: br label %for.body @@ -1775,6 +2259,11 @@ define void @ilogb_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_ilogb( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @ilogb_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_ilogb( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call i32 @ilogb(double [[IN:%.*]]) #[[ATTR50:[0-9]+]] +; ; 
ARMPL-NEON-LABEL: define void @ilogb_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x i32> @armpl_vilogbq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1782,6 +2271,11 @@ define void @ilogb_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @ilogb_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svilogb_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @ilogb_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svilogb_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call i32 @ilogb(double [[IN:%.*]]) #[[ATTR50:[0-9]+]] ; entry: br label %for.body @@ -1810,6 +2304,11 @@ define void @ilogb_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_ilogbf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @ilogb_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_ilogbf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call i32 @ilogbf(float [[IN:%.*]]) #[[ATTR51:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @ilogb_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x i32> @armpl_vilogbq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1817,6 +2316,11 @@ define void @ilogb_f32(ptr noalias %in.ptr, ptr noalias 
%out.ptr) { ; ARMPL-SVE-LABEL: define void @ilogb_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svilogb_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @ilogb_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svilogb_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call i32 @ilogbf(float [[IN:%.*]]) #[[ATTR51:[0-9]+]] ; entry: br label %for.body @@ -1848,6 +2352,11 @@ define void @ldexp_f64(ptr noalias %in1.ptr, ptr noalias %in2.ptr, ptr noalias % ; SLEEF-SVE-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP17:%.*]] = call @_ZGVsMxvv_ldexp( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD1:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @ldexp_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP11:%.*]] = call @_ZGVsMxvv_ldexp( [[WIDE_LOAD:%.*]], [[WIDE_LOAD1:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @ldexp(double [[IN1:%.*]], i32 [[IN2:%.*]]) #[[ATTR52:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @ldexp_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vldexpq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x i32> [[WIDE_LOAD1:%.*]]) @@ -1855,6 +2364,11 @@ define void @ldexp_f64(ptr noalias %in1.ptr, ptr noalias %in2.ptr, ptr noalias % ; ARMPL-SVE-LABEL: define void @ldexp_f64 ; ARMPL-SVE-SAME: (ptr noalias 
[[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP17:%.*]] = call @armpl_svldexp_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD1:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @ldexp_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP11:%.*]] = call @armpl_svldexp_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD1:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @ldexp(double [[IN1:%.*]], i32 [[IN2:%.*]]) #[[ATTR52:[0-9]+]] ; entry: br label %for.body @@ -1885,6 +2399,11 @@ define void @ldexp_f32(ptr noalias %in1.ptr, ptr noalias %in2.ptr, ptr noalias % ; SLEEF-SVE-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP17:%.*]] = call @_ZGVsMxvv_ldexpf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD1:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @ldexp_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP11:%.*]] = call @_ZGVsMxvv_ldexpf( [[WIDE_LOAD:%.*]], [[WIDE_LOAD1:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @ldexpf(float [[IN1:%.*]], i32 [[IN2:%.*]]) #[[ATTR53:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @ldexp_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vldexpq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x i32> [[WIDE_LOAD1:%.*]]) @@ -1892,6 +2411,11 @@ define void @ldexp_f32(ptr noalias %in1.ptr, ptr noalias %in2.ptr, ptr noalias % ; ARMPL-SVE-LABEL: define void 
@ldexp_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP17:%.*]] = call @armpl_svldexp_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD1:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @ldexp_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP11:%.*]] = call @armpl_svldexp_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD1:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @ldexpf(float [[IN1:%.*]], i32 [[IN2:%.*]]) #[[ATTR53:[0-9]+]] ; entry: br label %for.body @@ -1925,6 +2449,11 @@ define void @lgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_lgamma( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @lgamma_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_lgamma( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @lgamma(double [[IN:%.*]]) #[[ATTR54:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @lgamma_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlgammaq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1932,6 +2461,11 @@ define void @lgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @lgamma_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svlgamma_f64_x( 
[[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @lgamma_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svlgamma_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @lgamma(double [[IN:%.*]]) #[[ATTR54:[0-9]+]] ; entry: br label %for.body @@ -1960,6 +2494,11 @@ define void @lgamma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_lgammaf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @lgamma_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_lgammaf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @lgammaf(float [[IN:%.*]]) #[[ATTR55:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @lgamma_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlgammaq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1967,6 +2506,11 @@ define void @lgamma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @lgamma_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svlgamma_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @lgamma_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svlgamma_f32_x( [[WIDE_LOAD:%.*]], 
shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @lgammaf(float [[IN:%.*]]) #[[ATTR55:[0-9]+]] ; entry: br label %for.body @@ -1998,6 +2542,11 @@ define void @log_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_log( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @log_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_log( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @log(double [[IN:%.*]]) #[[ATTR56:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @log_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlogq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2005,6 +2554,11 @@ define void @log_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @log_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svlog_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @log_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svlog_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @log(double [[IN:%.*]]) #[[ATTR56:[0-9]+]] ; entry: br label %for.body @@ -2033,6 +2587,11 @@ define void @log_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias 
[[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_logf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @log_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_logf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @logf(float [[IN:%.*]]) #[[ATTR57:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @log_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlogq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2040,6 +2599,11 @@ define void @log_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @log_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svlog_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @log_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svlog_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @logf(float [[IN:%.*]]) #[[ATTR57:[0-9]+]] ; entry: br label %for.body @@ -2071,6 +2635,11 @@ define void @log10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_log10( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @log10_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: 
[[TMP9:%.*]] = call @_ZGVsMxv_log10( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @log10(double [[IN:%.*]]) #[[ATTR58:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @log10_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlog10q_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2078,6 +2647,11 @@ define void @log10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @log10_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svlog10_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @log10_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svlog10_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @log10(double [[IN:%.*]]) #[[ATTR58:[0-9]+]] ; entry: br label %for.body @@ -2106,6 +2680,11 @@ define void @log10_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_log10f( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @log10_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_log10f( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @log10f(float [[IN:%.*]]) #[[ATTR59:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @log10_f32 ; ARMPL-NEON-SAME: 
(ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlog10q_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2113,6 +2692,11 @@ define void @log10_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @log10_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svlog10_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @log10_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svlog10_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @log10f(float [[IN:%.*]]) #[[ATTR59:[0-9]+]] ; entry: br label %for.body @@ -2144,6 +2728,11 @@ define void @log1p_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_log1p( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @log1p_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_log1p( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @log1p(double [[IN:%.*]]) #[[ATTR60:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @log1p_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlog1pq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2151,6 +2740,11 @@ define void @log1p_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @log1p_f64 ; 
ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svlog1p_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @log1p_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svlog1p_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @log1p(double [[IN:%.*]]) #[[ATTR60:[0-9]+]] ; entry: br label %for.body @@ -2179,6 +2773,11 @@ define void @log1p_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_log1pf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @log1p_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_log1pf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @log1pf(float [[IN:%.*]]) #[[ATTR61:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @log1p_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlog1pq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2186,6 +2785,11 @@ define void @log1p_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @log1p_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svlog1p_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @log1p_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], 
ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svlog1p_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @log1pf(float [[IN:%.*]]) #[[ATTR61:[0-9]+]] ; entry: br label %for.body @@ -2217,6 +2821,11 @@ define void @log2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_log2( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @log2_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_log2( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @log2(double [[IN:%.*]]) #[[ATTR62:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @log2_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlog2q_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2224,6 +2833,11 @@ define void @log2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @log2_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svlog2_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @log2_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svlog2_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @log2(double [[IN:%.*]]) #[[ATTR62:[0-9]+]] ; entry: br 
label %for.body @@ -2252,6 +2866,11 @@ define void @log2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_log2f( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @log2_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_log2f( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @log2f(float [[IN:%.*]]) #[[ATTR63:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @log2_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlog2q_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2259,6 +2878,11 @@ define void @log2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @log2_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svlog2_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @log2_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svlog2_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @log2f(float [[IN:%.*]]) #[[ATTR63:[0-9]+]] ; entry: br label %for.body @@ -2288,7 +2912,12 @@ define void @modf_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; SLEEF-SVE-LABEL: define void @modf_f64 ; SLEEF-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE: [[TMP23:%.*]] = call @_ZGVsMxvl8_modf( 
[[WIDE_MASKED_LOAD:%.*]], ptr [[TMP22:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SLEEF-SVE: [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] +; +; SLEEF-SVE-NOPRED-LABEL: define void @modf_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP17:%.*]] = call @_ZGVsNxvl8_modf( [[WIDE_LOAD:%.*]], ptr [[TMP16:%.*]]) +; SLEEF-SVE-NOPRED: [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]]) #[[ATTR64:[0-9]+]] ; ; ARMPL-NEON-LABEL: define void @modf_f64 ; ARMPL-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { @@ -2296,7 +2925,11 @@ define void @modf_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; ARMPL-SVE-LABEL: define void @modf_f64 ; ARMPL-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: [[TMP23:%.*]] = call @armpl_svmodf_f64_x( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP22:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ARMPL-SVE: [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] +; +; ARMPL-SVE-NOPRED-LABEL: define void @modf_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP5:%.*]] = call <2 x double> @armpl_vmodfq_f64(<2 x double> [[WIDE_LOAD:%.*]], ptr [[TMP4:%.*]]) ; entry: br label %for.body @@ -2324,7 +2957,12 @@ define void @modf_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; SLEEF-SVE-LABEL: define void @modf_f32 ; SLEEF-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE: [[TMP23:%.*]] = call @_ZGVsMxvl4_modff( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP22:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SLEEF-SVE: [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]] +; +; SLEEF-SVE-NOPRED-LABEL: define void @modf_f32 +; 
SLEEF-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP17:%.*]] = call @_ZGVsNxvl4_modff( [[WIDE_LOAD:%.*]], ptr [[TMP16:%.*]]) +; SLEEF-SVE-NOPRED: [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]]) #[[ATTR65:[0-9]+]] ; ; ARMPL-NEON-LABEL: define void @modf_f32 ; ARMPL-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { @@ -2332,7 +2970,11 @@ define void @modf_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; ARMPL-SVE-LABEL: define void @modf_f32 ; ARMPL-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: [[TMP23:%.*]] = call @armpl_svmodf_f32_x( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP22:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ARMPL-SVE: [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]] +; +; ARMPL-SVE-NOPRED-LABEL: define void @modf_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP5:%.*]] = call <4 x float> @armpl_vmodfq_f32(<4 x float> [[WIDE_LOAD:%.*]], ptr [[TMP4:%.*]]) ; entry: br label %for.body @@ -2365,6 +3007,11 @@ define void @nextafter_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_nextafter( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @nextafter_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_nextafter( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @nextafter(double [[IN:%.*]], double [[IN]]) 
#[[ATTR66:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @nextafter_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vnextafterq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -2372,6 +3019,11 @@ define void @nextafter_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @nextafter_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svnextafter_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @nextafter_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svnextafter_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @nextafter(double [[IN:%.*]], double [[IN]]) #[[ATTR64:[0-9]+]] ; entry: br label %for.body @@ -2400,6 +3052,11 @@ define void @nextafter_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_nextafterf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @nextafter_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_nextafterf( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @nextafterf(float [[IN:%.*]], float [[IN]]) #[[ATTR67:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @nextafter_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], 
ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vnextafterq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -2407,6 +3064,11 @@ define void @nextafter_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @nextafter_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svnextafter_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @nextafter_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svnextafter_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @nextafterf(float [[IN:%.*]], float [[IN]]) #[[ATTR65:[0-9]+]] ; entry: br label %for.body @@ -2438,6 +3100,11 @@ define void @pow_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_pow( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @pow_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_pow( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @pow(double [[IN:%.*]], double [[IN]]) #[[ATTR68:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @pow_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vpowq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> 
[[WIDE_LOAD]]) @@ -2445,6 +3112,11 @@ define void @pow_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @pow_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svpow_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @pow_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svpow_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @pow(double [[IN:%.*]], double [[IN]]) #[[ATTR66:[0-9]+]] ; entry: br label %for.body @@ -2473,6 +3145,11 @@ define void @pow_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_powf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @pow_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_powf( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @powf(float [[IN:%.*]], float [[IN]]) #[[ATTR69:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @pow_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vpowq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -2480,6 +3157,11 @@ define void @pow_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @pow_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr 
noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svpow_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @pow_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svpow_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @powf(float [[IN:%.*]], float [[IN]]) #[[ATTR67:[0-9]+]] ; entry: br label %for.body @@ -2511,6 +3193,11 @@ define void @sin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_sin( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @sin_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_sin( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @sin(double [[IN:%.*]]) #[[ATTR70:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @sin_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vsinq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2518,6 +3205,11 @@ define void @sin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @sin_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svsin_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @sin_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) 
#[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svsin_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @sin(double [[IN:%.*]]) #[[ATTR68:[0-9]+]] ; entry: br label %for.body @@ -2546,6 +3238,11 @@ define void @sin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_sinf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @sin_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_sinf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @sinf(float [[IN:%.*]]) #[[ATTR71:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @sin_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vsinq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2553,6 +3250,11 @@ define void @sin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @sin_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svsin_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @sin_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svsin_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @sinf(float [[IN:%.*]]) #[[ATTR69:[0-9]+]] ; entry: br label %for.body @@ -2582,7 +3284,12 @@ define 
void @sincos_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; SLEEF-SVE-LABEL: define void @sincos_f64 ; SLEEF-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE: call void @_ZGVsMxvl8l8_sincos( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP23:%.*]], ptr [[TMP24:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SLEEF-SVE: call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR6:[0-9]+]] +; +; SLEEF-SVE-NOPRED-LABEL: define void @sincos_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: call void @_ZGVsNxvl8l8_sincos( [[WIDE_LOAD:%.*]], ptr [[TMP17:%.*]], ptr [[TMP18:%.*]]) +; SLEEF-SVE-NOPRED: call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR72:[0-9]+]] ; ; ARMPL-NEON-LABEL: define void @sincos_f64 ; ARMPL-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { @@ -2590,7 +3297,11 @@ define void @sincos_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; ARMPL-SVE-LABEL: define void @sincos_f64 ; ARMPL-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: call void @armpl_svsincos_f64_x( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP23:%.*]], ptr [[TMP24:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ARMPL-SVE: call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR6:[0-9]+]] +; +; ARMPL-SVE-NOPRED-LABEL: define void @sincos_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: call void @armpl_vsincosq_f64(<2 x double> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]]) ; entry: br label %for.body @@ -2617,7 +3328,12 @@ define void @sincos_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; SLEEF-SVE-LABEL: define void @sincos_f32 ; SLEEF-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr 
noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE: call void @_ZGVsMxvl4l4_sincosf( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP23:%.*]], ptr [[TMP24:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SLEEF-SVE: call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR7:[0-9]+]] +; +; SLEEF-SVE-NOPRED-LABEL: define void @sincos_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: call void @_ZGVsNxvl4l4_sincosf( [[WIDE_LOAD:%.*]], ptr [[TMP17:%.*]], ptr [[TMP18:%.*]]) +; SLEEF-SVE-NOPRED: call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR73:[0-9]+]] ; ; ARMPL-NEON-LABEL: define void @sincos_f32 ; ARMPL-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { @@ -2625,7 +3341,11 @@ define void @sincos_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; ARMPL-SVE-LABEL: define void @sincos_f32 ; ARMPL-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: call void @armpl_svsincos_f32_x( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP23:%.*]], ptr [[TMP24:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ARMPL-SVE: call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR7:[0-9]+]] +; +; ARMPL-SVE-NOPRED-LABEL: define void @sincos_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: call void @armpl_vsincosq_f32(<4 x float> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]]) ; entry: br label %for.body @@ -2655,7 +3375,12 @@ define void @sincospi_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; SLEEF-SVE-LABEL: define void @sincospi_f64 ; SLEEF-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE: call void @_ZGVsMxvl8l8_sincospi( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP23:%.*]], ptr [[TMP24:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; 
SLEEF-SVE: call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR8:[0-9]+]] +; +; SLEEF-SVE-NOPRED-LABEL: define void @sincospi_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: call void @_ZGVsNxvl8l8_sincospi( [[WIDE_LOAD:%.*]], ptr [[TMP17:%.*]], ptr [[TMP18:%.*]]) +; SLEEF-SVE-NOPRED: call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR74:[0-9]+]] ; ; ARMPL-NEON-LABEL: define void @sincospi_f64 ; ARMPL-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { @@ -2663,7 +3388,11 @@ define void @sincospi_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; ARMPL-SVE-LABEL: define void @sincospi_f64 ; ARMPL-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: call void @armpl_svsincospi_f64_x( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP23:%.*]], ptr [[TMP24:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ARMPL-SVE: call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR8:[0-9]+]] +; +; ARMPL-SVE-NOPRED-LABEL: define void @sincospi_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: call void @armpl_vsincospiq_f64(<2 x double> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]]) ; entry: br label %for.body @@ -2690,7 +3419,12 @@ define void @sincospi_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; SLEEF-SVE-LABEL: define void @sincospi_f32 ; SLEEF-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE: call void @_ZGVsMxvl4l4_sincospif( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP23:%.*]], ptr [[TMP24:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SLEEF-SVE: call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR9:[0-9]+]] +; +; SLEEF-SVE-NOPRED-LABEL: define void 
@sincospi_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: call void @_ZGVsNxvl4l4_sincospif( [[WIDE_LOAD:%.*]], ptr [[TMP17:%.*]], ptr [[TMP18:%.*]]) +; SLEEF-SVE-NOPRED: call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR75:[0-9]+]] ; ; ARMPL-NEON-LABEL: define void @sincospi_f32 ; ARMPL-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { @@ -2698,7 +3432,11 @@ define void @sincospi_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; ARMPL-SVE-LABEL: define void @sincospi_f32 ; ARMPL-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: call void @armpl_svsincospi_f32_x( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP23:%.*]], ptr [[TMP24:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ARMPL-SVE: call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR9:[0-9]+]] +; +; ARMPL-SVE-NOPRED-LABEL: define void @sincospi_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: call void @armpl_vsincospiq_f32(<4 x float> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]]) ; entry: br label %for.body @@ -2730,6 +3468,11 @@ define void @sinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_sinh( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @sinh_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_sinh( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @sinh(double [[IN:%.*]]) #[[ATTR76:[0-9]+]] +; ; 
ARMPL-NEON-LABEL: define void @sinh_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vsinhq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2737,6 +3480,11 @@ define void @sinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @sinh_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svsinh_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @sinh_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svsinh_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @sinh(double [[IN:%.*]]) #[[ATTR70:[0-9]+]] ; entry: br label %for.body @@ -2765,6 +3513,11 @@ define void @sinh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_sinhf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @sinh_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_sinhf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @sinhf(float [[IN:%.*]]) #[[ATTR77:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @sinh_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vsinhq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2772,6 +3525,11 @@ define void @sinh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; 
ARMPL-SVE-LABEL: define void @sinh_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svsinh_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @sinh_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svsinh_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @sinhf(float [[IN:%.*]]) #[[ATTR71:[0-9]+]] ; entry: br label %for.body @@ -2803,6 +3561,11 @@ define void @sinpi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_sinpi( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @sinpi_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_sinpi( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @sinpi(double [[IN:%.*]]) #[[ATTR78:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @sinpi_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vsinpiq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2810,6 +3573,11 @@ define void @sinpi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @sinpi_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svsinpi_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @sinpi_f64 +; 
ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svsinpi_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @sinpi(double [[IN:%.*]]) #[[ATTR72:[0-9]+]] ; entry: br label %for.body @@ -2838,6 +3606,11 @@ define void @sinpi_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_sinpif( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @sinpi_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_sinpif( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @sinpif(float [[IN:%.*]]) #[[ATTR79:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @sinpi_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vsinpiq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2845,6 +3618,11 @@ define void @sinpi_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @sinpi_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svsinpi_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @sinpi_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svsinpi_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call 
float @sinpif(float [[IN:%.*]]) #[[ATTR73:[0-9]+]] ; entry: br label %for.body @@ -2876,6 +3654,11 @@ define void @sqrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_sqrt( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @sqrt_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_sqrt( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @sqrt(double [[IN:%.*]]) #[[ATTR80:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @sqrt_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vsqrtq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2883,6 +3666,11 @@ define void @sqrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @sqrt_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svsqrt_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @sqrt_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svsqrt_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @sqrt(double [[IN:%.*]]) #[[ATTR74:[0-9]+]] ; entry: br label %for.body @@ -2911,6 +3699,11 @@ define void @sqrt_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_sqrtf( 
[[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @sqrt_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_sqrtf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @sqrtf(float [[IN:%.*]]) #[[ATTR81:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @sqrt_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vsqrtq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2918,6 +3711,11 @@ define void @sqrt_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @sqrt_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svsqrt_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @sqrt_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svsqrt_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @sqrtf(float [[IN:%.*]]) #[[ATTR75:[0-9]+]] ; entry: br label %for.body @@ -2949,6 +3747,11 @@ define void @tan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_tan( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @tan_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_tan( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, 
i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @tan(double [[IN:%.*]]) #[[ATTR82:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @tan_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vtanq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2956,6 +3759,11 @@ define void @tan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @tan_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svtan_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @tan_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svtan_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @tan(double [[IN:%.*]]) #[[ATTR76:[0-9]+]] ; entry: br label %for.body @@ -2984,6 +3792,11 @@ define void @tan_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_tanf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @tan_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_tanf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @tanf(float [[IN:%.*]]) #[[ATTR83:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @tan_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vtanq_f32(<4 
x float> [[WIDE_LOAD:%.*]]) @@ -2991,6 +3804,11 @@ define void @tan_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @tan_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svtan_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @tan_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svtan_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @tanf(float [[IN:%.*]]) #[[ATTR77:[0-9]+]] ; entry: br label %for.body @@ -3022,6 +3840,11 @@ define void @tanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_tanh( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @tanh_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_tanh( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @tanh(double [[IN:%.*]]) #[[ATTR84:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @tanh_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vtanhq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -3029,6 +3852,11 @@ define void @tanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @tanh_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svtanh_f64_x( 
[[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @tanh_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svtanh_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @tanh(double [[IN:%.*]]) #[[ATTR78:[0-9]+]] ; entry: br label %for.body @@ -3057,6 +3885,11 @@ define void @tanh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_tanhf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @tanh_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_tanhf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @tanhf(float [[IN:%.*]]) #[[ATTR85:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @tanh_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vtanhq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -3064,6 +3897,11 @@ define void @tanh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @tanh_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svtanh_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @tanh_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svtanh_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( 
poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @tanhf(float [[IN:%.*]]) #[[ATTR79:[0-9]+]] ; entry: br label %for.body @@ -3095,6 +3933,11 @@ define void @tgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_tgamma( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @tgamma_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_tgamma( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @tgamma(double [[IN:%.*]]) #[[ATTR86:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @tgamma_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vtgammaq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -3102,6 +3945,11 @@ define void @tgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @tgamma_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svtgamma_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @tgamma_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svtgamma_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @tgamma(double [[IN:%.*]]) #[[ATTR80:[0-9]+]] ; entry: br label %for.body @@ -3130,6 +3978,11 @@ define void @tgamma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr 
noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_tgammaf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @tgamma_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_tgammaf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @tgammaf(float [[IN:%.*]]) #[[ATTR87:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @tgamma_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vtgammaq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -3137,6 +3990,11 @@ define void @tgamma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @tgamma_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svtgamma_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @tgamma_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svtgamma_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @tgammaf(float [[IN:%.*]]) #[[ATTR81:[0-9]+]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll index 29440ca174248f..f60ab5e848dd3a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll @@ -21,7 +21,8 @@ define void @test_linear8(ptr noalias 
%a, ptr readnone %b, i64 %n) { ; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) { ; NEON_INTERLEAVE: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP2:%.*]], i32 0 ; NEON_INTERLEAVE: [[TMP5:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP4]]) -; NEON_INTERLEAVE: [[TMP6:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP4]]) +; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP3:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP7:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP6]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR0:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_linear8 @@ -34,8 +35,9 @@ define void @test_linear8(ptr noalias %a, ptr readnone %b, i64 %n) { ; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; SVE_OR_NEON_INTERLEAVE: [[TMP33:%.*]] = extractelement [[TMP31:%.*]], i32 0 ; SVE_OR_NEON_INTERLEAVE: [[TMP34:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP33]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_OR_NEON_INTERLEAVE: [[TMP35:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP33]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_OR_NEON_INTERLEAVE: [[TMP47:%.*]] = extractelement [[TMP45:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[TMP35:%.*]] = extractelement [[TMP32:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[TMP36:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP35]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_OR_NEON_INTERLEAVE: [[TMP48:%.*]] = extractelement [[TMP46:%.*]], i32 0 ; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] ; ; SVE_TF-LABEL: define void @test_linear8 @@ -49,8 +51,9 @@ define void @test_linear8(ptr noalias %a, ptr readnone %b, i64 %n) { ; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; SVE_TF_INTERLEAVE: [[TMP33:%.*]] = extractelement [[TMP31:%.*]], i32 0 ; SVE_TF_INTERLEAVE: [[TMP34:%.*]] = call 
@vec_foo_linear8_mask_sve(ptr [[TMP33]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_TF_INTERLEAVE: [[TMP35:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP33]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_TF_INTERLEAVE: [[TMP47:%.*]] = extractelement [[TMP45:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[TMP35:%.*]] = extractelement [[TMP32:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[TMP36:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP35]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_TF_INTERLEAVE: [[TMP48:%.*]] = extractelement [[TMP46:%.*]], i32 0 ; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] ; entry: @@ -81,7 +84,8 @@ define void @test_vector_linear4(ptr noalias %a, ptr readnone %b, ptr readonly % ; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) { ; NEON_INTERLEAVE: [[TMP8:%.*]] = extractelement <4 x ptr> [[TMP6:%.*]], i32 0 ; NEON_INTERLEAVE: [[TMP9:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD:%.*]], ptr [[TMP8]]) -; NEON_INTERLEAVE: [[TMP10:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD2:%.*]], ptr [[TMP8]]) +; NEON_INTERLEAVE: [[TMP10:%.*]] = extractelement <4 x ptr> [[TMP7:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP11:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD2:%.*]], ptr [[TMP10]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR1:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_vector_linear4 @@ -176,7 +180,8 @@ define void @test_linear16_wide_stride(ptr noalias %a, ptr readnone %b, i64 %n) ; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) { ; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP4:%.*]], i32 0 ; NEON_INTERLEAVE: [[TMP7:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP6]]) -; NEON_INTERLEAVE: [[TMP8:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP6]]) +; 
NEON_INTERLEAVE: [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP5:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP9:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP8]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2]] ; ; SVE_OR_NEON-LABEL: define void @test_linear16_wide_stride @@ -228,7 +233,9 @@ define void @test_linear4_linear8(ptr noalias %a, ptr readnone %b, ptr readonly ; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP2:%.*]], i32 0 ; NEON_INTERLEAVE: [[TMP7:%.*]] = extractelement <4 x ptr> [[TMP4:%.*]], i32 0 ; NEON_INTERLEAVE: [[TMP8:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP6]], ptr [[TMP7]]) -; NEON_INTERLEAVE: [[TMP9:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP6]], ptr [[TMP7]]) +; NEON_INTERLEAVE: [[TMP9:%.*]] = extractelement <4 x ptr> [[TMP3:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP10:%.*]] = extractelement <4 x ptr> [[TMP5:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP11:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP9]], ptr [[TMP10]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_linear4_linear8 @@ -243,8 +250,10 @@ define void @test_linear4_linear8(ptr noalias %a, ptr readnone %b, ptr readonly ; SVE_OR_NEON_INTERLEAVE: [[TMP35:%.*]] = extractelement [[TMP31:%.*]], i32 0 ; SVE_OR_NEON_INTERLEAVE: [[TMP36:%.*]] = extractelement [[TMP33:%.*]], i32 0 ; SVE_OR_NEON_INTERLEAVE: [[TMP37:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_OR_NEON_INTERLEAVE: [[TMP38:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_OR_NEON_INTERLEAVE: [[TMP50:%.*]] = extractelement [[TMP48:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[TMP38:%.*]] = extractelement [[TMP32:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[TMP39:%.*]] = extractelement 
[[TMP34:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[TMP40:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP38]], ptr [[TMP39]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_OR_NEON_INTERLEAVE: [[TMP52:%.*]] = extractelement [[TMP50:%.*]], i32 0 ; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR7:[0-9]+]] ; ; SVE_TF-LABEL: define void @test_linear4_linear8 @@ -260,8 +269,10 @@ define void @test_linear4_linear8(ptr noalias %a, ptr readnone %b, ptr readonly ; SVE_TF_INTERLEAVE: [[TMP35:%.*]] = extractelement [[TMP31:%.*]], i32 0 ; SVE_TF_INTERLEAVE: [[TMP36:%.*]] = extractelement [[TMP33:%.*]], i32 0 ; SVE_TF_INTERLEAVE: [[TMP37:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_TF_INTERLEAVE: [[TMP38:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_TF_INTERLEAVE: [[TMP50:%.*]] = extractelement [[TMP48:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[TMP38:%.*]] = extractelement [[TMP32:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[TMP39:%.*]] = extractelement [[TMP34:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[TMP40:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP38]], ptr [[TMP39]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_TF_INTERLEAVE: [[TMP52:%.*]] = extractelement [[TMP50:%.*]], i32 0 ; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR7:[0-9]+]] ; entry: @@ -293,7 +304,8 @@ define void @test_linear3_non_ptr(ptr noalias %a, i64 %n) { ; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) { ; NEON_INTERLEAVE: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2:%.*]], i32 0 ; NEON_INTERLEAVE: [[TMP5:%.*]] = call <4 x i32> @vec_bar_linear3_nomask_neon(i32 [[TMP4]]) -; NEON_INTERLEAVE: [[TMP6:%.*]] = call <4 x i32> @vec_bar_linear3_nomask_neon(i32 [[TMP4]]) +; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP7:%.*]] = call <4 x 
i32> @vec_bar_linear3_nomask_neon(i32 [[TMP6]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR4:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_linear3_non_ptr @@ -343,7 +355,8 @@ define void @test_linearn5_non_ptr_neg_stride(ptr noalias %a, i64 %n) { ; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) { ; NEON_INTERLEAVE: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2:%.*]], i32 0 ; NEON_INTERLEAVE: [[TMP5:%.*]] = call <4 x i32> @vec_bar_linearn5_nomask_neon(i32 [[TMP4]]) -; NEON_INTERLEAVE: [[TMP6:%.*]] = call <4 x i32> @vec_bar_linearn5_nomask_neon(i32 [[TMP4]]) +; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP7:%.*]] = call <4 x i32> @vec_bar_linearn5_nomask_neon(i32 [[TMP6]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR5:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_linearn5_non_ptr_neg_stride @@ -393,7 +406,8 @@ define void @test_linear8_return_void(ptr noalias %in, ptr noalias %out, i64 %n) ; NEON_INTERLEAVE-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) { ; NEON_INTERLEAVE: [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP6:%.*]], i32 0 ; NEON_INTERLEAVE: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD:%.*]], ptr [[TMP8]]) -; NEON_INTERLEAVE: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD2:%.*]], ptr [[TMP8]]) +; NEON_INTERLEAVE: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP7:%.*]], i32 0 +; NEON_INTERLEAVE: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD2:%.*]], ptr [[TMP9]]) ; NEON_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR6:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_linear8_return_void @@ -406,8 +420,9 @@ define void @test_linear8_return_void(ptr noalias %in, ptr noalias %out, i64 %n) ; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; SVE_OR_NEON_INTERLEAVE: 
[[TMP39:%.*]] = extractelement [[TMP37:%.*]], i32 0 ; SVE_OR_NEON_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP39]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_OR_NEON_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP39]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_OR_NEON_INTERLEAVE: [[TMP45:%.*]] = extractelement [[TMP43:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[TMP40:%.*]] = extractelement [[TMP38:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP40]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_OR_NEON_INTERLEAVE: [[TMP46:%.*]] = extractelement [[TMP44:%.*]], i32 0 ; SVE_OR_NEON_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR10:[0-9]+]] ; ; SVE_TF-LABEL: define void @test_linear8_return_void @@ -421,8 +436,9 @@ define void @test_linear8_return_void(ptr noalias %in, ptr noalias %out, i64 %n) ; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; SVE_TF_INTERLEAVE: [[TMP39:%.*]] = extractelement [[TMP37:%.*]], i32 0 ; SVE_TF_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP39]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_TF_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP39]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_TF_INTERLEAVE: [[TMP45:%.*]] = extractelement [[TMP43:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[TMP40:%.*]] = extractelement [[TMP38:%.*]], i32 0 +; SVE_TF_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP40]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_TF_INTERLEAVE: [[TMP46:%.*]] = extractelement [[TMP44:%.*]], i32 0 ; SVE_TF_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR10:[0-9]+]] ; entry: diff --git a/llvm/test/Transforms/LowerTypeTests/cfi-annotation.ll b/llvm/test/Transforms/LowerTypeTests/cfi-annotation.ll new file mode 100644 index 
00000000000000..034af89112cb63 --- /dev/null +++ b/llvm/test/Transforms/LowerTypeTests/cfi-annotation.ll @@ -0,0 +1,68 @@ +; REQUIRES: aarch64-registered-target + +; RUN: opt -passes=lowertypetests %s -o %t.o +; RUN: llvm-dis %t.o -o - | FileCheck %s --check-prefix=CHECK-foobar +; CHECK-foobar: {{llvm.global.annotations = .*[foo|bar], .*[foo|bar],}} +; RUN: llvm-dis %t.o -o - | FileCheck %s --check-prefix=CHECK-cfi +; CHECK-cfi-NOT: {{llvm.global.annotations = .*cfi.*}} + +target triple = "aarch64-none-linux-gnu" + +@.src = private unnamed_addr constant [7 x i8] c"test.c\00", align 1 +@.str = private unnamed_addr constant [30 x i8] c"annotation_string_literal_bar\00", section "llvm.metadata" +@.str.1 = private unnamed_addr constant [7 x i8] c"test.c\00", section "llvm.metadata" +@.str.2 = private unnamed_addr constant [30 x i8] c"annotation_string_literal_foo\00", section "llvm.metadata" +@llvm.global.annotations = appending global [2 x { ptr, ptr, ptr, i32, ptr }] [{ ptr, ptr, ptr, i32, ptr } { ptr @bar, ptr @.str, ptr @.str.1, i32 2, ptr null }, { ptr, ptr, ptr, i32, ptr } { ptr @foo, ptr @.str.2, ptr @.str.1, i32 1, ptr null }], section "llvm.metadata" + +define i32 @bar(i32 noundef %0) #0 !type !8 !type !9 { + %2 = alloca i32, align 4 + store i32 %0, ptr %2, align 4 + %3 = load i32, ptr %2, align 4 + %4 = call i32 @foo(i32 noundef %3) + ret i32 %4 +} + +declare !type !8 !type !9 i32 @foo(i32 noundef) #1 + +define i32 @test(i32 noundef %0) #0 !type !8 !type !9 { + %2 = alloca i32, align 4 + %3 = alloca ptr, align 8 + store i32 %0, ptr %2, align 4 + %4 = load i32, ptr %2, align 4 + %5 = icmp sgt i32 %4, 0 + %6 = zext i1 %5 to i64 + %7 = select i1 %5, ptr @foo, ptr @bar + store ptr %7, ptr %3, align 8 + %8 = load ptr, ptr %3, align 8 + %9 = call i1 @llvm.type.test(ptr %8, metadata !"_ZTSFiiE"), !nosanitize !10 + br i1 %9, label %11, label %10, !nosanitize !10 + +10: + call void @llvm.ubsantrap(i8 2) #4, !nosanitize !10 + unreachable, !nosanitize !10 + +11: + %12 = 
load i32, ptr %2, align 4 + %13 = call i32 %8(i32 noundef %12) + ret i32 %13 +} + +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.ubsantrap(i8 immarg) + +attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" } +attributes #1 = { "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" } +attributes #4 = { noreturn nounwind } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 4, !"CFI Canonical Jump Tables", i32 0} +!2 = !{i32 8, !"PIC Level", i32 2} +!3 = !{i32 7, !"uwtable", i32 2} +!4 = !{i32 7, !"frame-pointer", i32 1} +!5 = !{i32 1, !"ThinLTO", i32 0} +!6 = !{i32 1, !"EnableSplitLTOUnit", i32 1} +!8 = !{i64 0, !"_ZTSFiiE"} +!9 = !{i64 0, !"_ZTSFiiE.generalized"} +!10 = !{} diff --git a/llvm/test/Transforms/PhaseOrdering/X86/loop-vectorizer-noalias.ll b/llvm/test/Transforms/PhaseOrdering/X86/loop-vectorizer-noalias.ll new file mode 100644 index 00000000000000..5c85c0d21f59f7 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/X86/loop-vectorizer-noalias.ll @@ -0,0 +1,78 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -O3 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define internal void @acc(ptr noalias noundef %val, ptr noalias noundef %prev) { +entry: + %0 = load i8, ptr %prev, align 1 + %conv = zext i8 %0 to i32 + %1 = load i8, ptr %val, align 1 + %conv1 = zext i8 %1 to i32 + %add = add nsw i32 %conv1, %conv + %conv2 = trunc i32 %add to i8 + store i8 %conv2, ptr %val, align 1 + ret void +} + +; This loop should not get vectorized. 
+define void @accsum(ptr noundef %vals, i64 noundef %num) #0 { +; CHECK-LABEL: define void @accsum( +; CHECK-SAME: ptr nocapture noundef [[VALS:%.*]], i64 noundef [[NUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[NUM]], 1 +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[LOAD_INITIAL:%.*]] = load i8, ptr [[VALS]], align 1 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[STORE_FORWARDED:%.*]] = phi i8 [ [[LOAD_INITIAL]], [[FOR_BODY_PREHEADER]] ], [ [[ADD_I:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_02:%.*]] = phi i64 [ 1, [[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[VALS]], i64 [[I_02]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] +; CHECK-NEXT: [[ADD_I]] = add i8 [[TMP0]], [[STORE_FORWARDED]] +; CHECK-NEXT: store i8 [[ADD_I]], ptr [[ARRAYIDX]], align 1, !alias.scope [[META0]], !noalias [[META3]] +; CHECK-NEXT: [[INC]] = add nuw i64 [[I_02]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[NUM]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i64 [ 1, %entry ], [ %inc, %for.inc ] + %cmp = icmp ult i64 %i.0, %num + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + br label %for.end + +for.body: ; preds = %for.cond + %arrayidx = getelementptr inbounds i8, ptr %vals, i64 %i.0 + %sub = sub i64 %i.0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %vals, i64 %sub + call void @acc(ptr noundef %arrayidx, ptr noundef %arrayidx1) + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add i64 %i.0, 1 + br label %for.cond + 
+for.end: ; preds = %for.cond.cleanup + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +attributes #0 = { "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87"} +;. +; CHECK: [[META0]] = !{[[META1:![0-9]+]]} +; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]], !"acc: %val"} +; CHECK: [[META2]] = distinct !{[[META2]], !"acc"} +; CHECK: [[META3]] = !{[[META4:![0-9]+]]} +; CHECK: [[META4]] = distinct !{[[META4]], [[META2]], !"acc: %prev"} +;. diff --git a/llvm/test/Transforms/SLPVectorizer/X86/call-arg-reduced-by-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/call-arg-reduced-by-minbitwidth.ll new file mode 100644 index 00000000000000..49e89feb475b95 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/call-arg-reduced-by-minbitwidth.ll @@ -0,0 +1,82 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-pc-windows-msvc19.34.0 < %s | FileCheck %s + +define void @test(ptr %0, i8 %1, i1 %cmp12.i) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[TMP0:%.*]], i8 [[TMP1:%.*]], i1 [[CMP12_I:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i1> poison, i1 [[CMP12_I]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: br label [[PRE:%.*]] +; CHECK: pre: +; CHECK-NEXT: [[TMP6:%.*]] = zext <8 x i8> [[TMP5]] to <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> 
@llvm.umax.v8i32(<8 x i32> [[TMP6]], <8 x i32> ) +; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[TMP8]], <8 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = trunc <8 x i32> [[TMP9]] to <8 x i8> +; CHECK-NEXT: store <8 x i8> [[TMP10]], ptr [[TMP0]], align 1 +; CHECK-NEXT: br label [[PRE]] +; +entry: + %idx11 = getelementptr i8, ptr %0, i64 1 + %idx22 = getelementptr i8, ptr %0, i64 2 + %idx33 = getelementptr i8, ptr %0, i64 3 + %idx44 = getelementptr i8, ptr %0, i64 4 + %idx55 = getelementptr i8, ptr %0, i64 5 + %idx66 = getelementptr i8, ptr %0, i64 6 + %idx77 = getelementptr i8, ptr %0, i64 7 + br label %pre + +pre: + %conv.i = zext i8 %1 to i32 + %2 = tail call i32 @llvm.umax.i32(i32 %conv.i, i32 1) + %.sroa.speculated.i = add i32 %2, 1 + %intensity.0.i = select i1 %cmp12.i, i32 %.sroa.speculated.i, i32 %conv.i + %conv14.i = trunc i32 %intensity.0.i to i8 + store i8 %conv14.i, ptr %0, align 1 + %conv.i.1 = zext i8 %1 to i32 + %3 = tail call i32 @llvm.umax.i32(i32 %conv.i.1, i32 1) + %ss1 = add i32 %3, 1 + %ii1 = select i1 %cmp12.i, i32 %ss1, i32 %conv.i.1 + %conv14.i.1 = trunc i32 %ii1 to i8 + store i8 %conv14.i.1, ptr %idx11, align 1 + %conv.i.2 = zext i8 %1 to i32 + %4 = tail call i32 @llvm.umax.i32(i32 %conv.i.2, i32 1) + %ss2 = add i32 %4, 1 + %ii2 = select i1 %cmp12.i, i32 %ss2, i32 %conv.i.2 + %conv14.i.2 = trunc i32 %ii2 to i8 + store i8 %conv14.i.2, ptr %idx22, align 1 + %conv.i.3 = zext i8 %1 to i32 + %5 = tail call i32 @llvm.umax.i32(i32 %conv.i.3, i32 1) + %ss3 = add i32 %5, 1 + %ii3 = select i1 %cmp12.i, i32 %ss3, i32 %conv.i.3 + %conv14.i.3 = trunc i32 %ii3 to i8 + store i8 %conv14.i.3, ptr %idx33, align 1 + %conv.i.4 = zext i8 %1 to i32 + %6 = tail call i32 @llvm.umax.i32(i32 %conv.i.4, i32 1) + %ss4 = add i32 %6, 1 + %ii4 = select i1 %cmp12.i, i32 %ss4, i32 %conv.i.4 + %conv14.i.4 = trunc i32 %ii4 to i8 + store i8 %conv14.i.4, ptr %idx44, align 1 + %conv.i.5 = zext i8 %1 to i32 
+ %7 = tail call i32 @llvm.umax.i32(i32 %conv.i.5, i32 1) + %ss5 = add i32 %7, 1 + %ii5 = select i1 %cmp12.i, i32 %ss5, i32 %conv.i.5 + %conv14.i.5 = trunc i32 %ii5 to i8 + store i8 %conv14.i.5, ptr %idx55, align 1 + %conv.i.6 = zext i8 %1 to i32 + %8 = tail call i32 @llvm.umax.i32(i32 %conv.i.6, i32 1) + %ss6 = add i32 %8, 1 + %ii6 = select i1 %cmp12.i, i32 %ss6, i32 %conv.i.6 + %conv14.i.6 = trunc i32 %ii6 to i8 + store i8 %conv14.i.6, ptr %idx66, align 1 + %conv.i.7 = zext i8 %1 to i32 + %9 = tail call i32 @llvm.umax.i32(i32 %conv.i.7, i32 1) + %ss7 = add i32 %9, 1 + %ii7 = select i1 %cmp12.i, i32 %ss7, i32 %conv.i.7 + %conv14.i.7 = trunc i32 %ii7 to i8 + store i8 %conv14.i.7, ptr %idx77, align 1 + br label %pre +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll new file mode 100644 index 00000000000000..ba406c8f20bb08 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll @@ -0,0 +1,107 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=x86-64-v3 < %s | FileCheck %s + +define void @test(double %i) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: double [[I:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[I]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> zeroinitializer, [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> , double [[I]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> zeroinitializer, [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP3]], <4 x i32> +; CHECK-NEXT: 
[[TMP7:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> [[TMP7]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x double> [[TMP8]], <8 x double> , <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP4]], i32 7 +; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x double> zeroinitializer, [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd <8 x double> zeroinitializer, [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd <8 x double> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = fcmp ult <8 x double> [[TMP13]], zeroinitializer +; CHECK-NEXT: br label [[BB116:%.*]] +; CHECK: bb116: +; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x double> [[TMP15]], i32 1 +; CHECK-NEXT: [[I120:%.*]] = fadd double [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x double> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMP19:%.*]] = fmul <2 x double> zeroinitializer, [[TMP3]] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[TMP18]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[TMP18]], i32 1 +; CHECK-NEXT: [[I128:%.*]] = fadd double [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[I139:%.*]] = call double @llvm.maxnum.f64(double [[I128]], double 0.000000e+00) +; CHECK-NEXT: [[TMP22:%.*]] = fadd <2 x double> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP22]], <2 x double> zeroinitializer) +; CHECK-NEXT: [[TMP24:%.*]] = fmul <2 x double> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = fptosi <2 x double> [[TMP24]] to <2 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = sub <2 x i32> zeroinitializer, [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp sgt <2 x i32> [[TMP26]], zeroinitializer +; CHECK-NEXT: 
[[I147:%.*]] = fcmp ogt double [[I120]], 0.000000e+00 +; CHECK-NEXT: ret void +; +bb: + %i74 = fsub double 0.000000e+00, poison + %i75 = fsub double 0.000000e+00, %i + %i76 = fmul double 0.000000e+00, %i75 + %i77 = fadd double %i76, 0.000000e+00 + %i78 = fadd double %i77, 0.000000e+00 + %i79 = fcmp ult double %i78, 0.000000e+00 + %i81 = fsub double %i, 0.000000e+00 + %i82 = fmul double 0.000000e+00, %i81 + %i83 = fadd double 0.000000e+00, %i82 + %i84 = fadd double %i83, 0.000000e+00 + %i85 = fcmp ult double %i84, 0.000000e+00 + %i86 = fsub double 0.000000e+00, %i + %i87 = fmul double 0.000000e+00, %i86 + %i88 = fadd double %i87, 0.000000e+00 + %i89 = fadd double %i88, 0.000000e+00 + %i90 = fcmp ult double %i89, 0.000000e+00 + %i91 = fsub double 0.000000e+00, 0.000000e+00 + %i92 = fmul double 0.000000e+00, 0.000000e+00 + %i93 = fadd double %i92, 0.000000e+00 + %i94 = fadd double %i93, 0.000000e+00 + %i95 = fcmp ult double %i94, 0.000000e+00 + %i96 = fsub double poison, 0.000000e+00 + %i97 = fadd double %i77, 0.000000e+00 + %i98 = fcmp ult double %i97, 0.000000e+00 + %i99 = fadd double %i83, 0.000000e+00 + %i100 = fcmp ult double %i99, 0.000000e+00 + %i101 = fmul double 0.000000e+00, 0.000000e+00 + %i102 = fadd double %i101, 0.000000e+00 + %i103 = fadd double %i102, 0.000000e+00 + %i104 = fcmp ult double %i103, 0.000000e+00 + %i105 = fmul double 0.000000e+00, 0.000000e+00 + %i106 = fadd double %i105, 0.000000e+00 + %i107 = fadd double %i106, 0.000000e+00 + %i108 = fcmp ult double %i107, 0.000000e+00 + br label %bb116 + +bb116: + %i117 = fmul double 0.000000e+00, %i81 + %i119 = fmul double 0.000000e+00, %i96 + %i120 = fadd double %i117, %i119 + %i121 = fmul double 0.000000e+00, %i74 + %i122 = fmul double 0.000000e+00, %i75 + %i123 = fadd double %i122, 0.000000e+00 + %i124 = fmul double 0.000000e+00, %i91 + %i125 = fadd double %i124, 0.000000e+00 + %i127 = fmul double 0.000000e+00, %i86 + %i128 = fadd double %i127, %i121 + %i133 = call double @llvm.maxnum.f64(double 
%i123, double 0.000000e+00) + %i134 = fmul double %i133, 0.000000e+00 + %i135 = fptosi double %i134 to i32 + %i136 = sub i32 0, %i135 + %i137 = icmp sgt i32 %i136, 0 + %i139 = call double @llvm.maxnum.f64(double %i128, double 0.000000e+00) + %i142 = call double @llvm.maxnum.f64(double %i125, double 0.000000e+00) + %i143 = fmul double %i142, 0.000000e+00 + %i144 = fptosi double %i143 to i32 + %i145 = sub i32 0, %i144 + %i146 = icmp sgt i32 %i145, 0 + %i147 = fcmp ogt double %i120, 0.000000e+00 + ret void +} + +declare double @llvm.maxnum.f64(double, double) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll new file mode 100644 index 00000000000000..f665dac3282b79 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll @@ -0,0 +1,144 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=x86-64-v3 -S < %s | FileCheck %s + +define void @foo(double %i) { +; CHECK-LABEL: define void @foo( +; CHECK-SAME: double [[I:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> , double [[I]], i32 2 +; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> zeroinitializer, [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[I]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> zeroinitializer, [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> , <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> [[TMP7]], double [[TMP2]], i32 3 +; 
CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> , <8 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP5]], i32 6 +; CHECK-NEXT: [[TMP12:%.*]] = fmul <8 x double> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd <8 x double> zeroinitializer, [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x double> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = fcmp ult <8 x double> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = freeze <8 x i1> [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP16]]) +; CHECK-NEXT: br i1 [[TMP17]], label [[BB58:%.*]], label [[BB115:%.*]] +; CHECK: bb115: +; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x double> zeroinitializer, [[TMP4]] +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[TMP18]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[TMP18]], i32 1 +; CHECK-NEXT: [[I118:%.*]] = fadd double [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = fmul <4 x double> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x double> , <4 x double> [[TMP22]], <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = fadd <4 x double> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = select <4 x i1> zeroinitializer, <4 x double> zeroinitializer, <4 x double> [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = fmul <4 x double> [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = fmul <4 x double> [[TMP27]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = fptosi <4 x double> [[TMP28]] to <4 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = or <4 x i32> zeroinitializer, [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = call i32 
@llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP30]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = icmp slt i32 [[TMP31]], 32000 +; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP31]], i32 32000 +; CHECK-NEXT: [[I163:%.*]] = fcmp ogt double [[I118]], 0.000000e+00 +; CHECK-NEXT: [[I164:%.*]] = icmp slt i32 0, [[OP_RDX1]] +; CHECK-NEXT: unreachable +; CHECK: bb58: +; CHECK-NEXT: ret void +; +bb: + %i75 = fsub double 0.000000e+00, 0.000000e+00 + %i76 = fsub double 0.000000e+00, 0.000000e+00 + %i77 = fmul double 0.000000e+00, %i75 + %i78 = fmul double 0.000000e+00, %i76 + %i79 = fadd double %i78, 0.000000e+00 + %i80 = fadd double %i79, 0.000000e+00 + %i81 = fcmp ult double %i80, 0.000000e+00 + %i82 = fsub double 0.000000e+00, poison + %i83 = fmul double 0.000000e+00, %i82 + %i84 = fadd double 0.000000e+00, %i83 + %i85 = fadd double %i84, 0.000000e+00 + %i86 = fcmp ult double %i85, 0.000000e+00 + %i87 = fsub double 0.000000e+00, %i + %i88 = fadd double 0.000000e+00, %i77 + %i89 = fadd double %i88, 0.000000e+00 + %i90 = fcmp ult double %i89, 0.000000e+00 + %i91 = fsub double 0.000000e+00, 0.000000e+00 + %i92 = fmul double poison, 0.000000e+00 + %i93 = fadd double %i92, 0.000000e+00 + %i94 = fadd double %i93, 0.000000e+00 + %i95 = fcmp ult double %i94, 0.000000e+00 + %i96 = fadd double %i79, 0.000000e+00 + %i97 = fcmp ult double %i96, 0.000000e+00 + %i98 = fadd double %i84, 0.000000e+00 + %i99 = fcmp ult double %i98, 0.000000e+00 + %i100 = fadd double 0.000000e+00, %i77 + %i101 = fadd double %i100, 0.000000e+00 + %i102 = fcmp ult double %i101, 0.000000e+00 + %i103 = fsub double 0.000000e+00, %i + %i104 = fmul double poison, 0.000000e+00 + %i105 = fadd double %i104, 0.000000e+00 + %i106 = fadd double %i105, 0.000000e+00 + %i107 = fcmp ult double %i106, 0.000000e+00 + %i108 = select i1 %i107, i1 %i102, i1 false + %i109 = select i1 %i108, i1 %i99, i1 false + %i110 = select i1 %i109, i1 %i97, i1 false + %i111 = select i1 %i110, i1 %i95, i1 false + %i112 = select i1 %i111, i1 
%i90, i1 false + %i113 = select i1 %i112, i1 %i86, i1 false + %i114 = select i1 %i113, i1 %i81, i1 false + br i1 %i114, label %bb58, label %bb115 + +bb115: + %i116 = fmul double 0.000000e+00, %i103 + %i117 = fmul double 0.000000e+00, %i82 + %i118 = fadd double %i116, %i117 + %i120 = fmul double 0.000000e+00, %i75 + %i121 = fmul double 0.000000e+00, %i76 + %i122 = fadd double %i121, 0.000000e+00 + %i123 = fadd double 0.000000e+00, %i120 + %i124 = fmul double 0.000000e+00, %i91 + %i125 = fadd double %i124, %i82 + %i126 = fadd double %i125, 0.000000e+00 + %i127 = fmul double 0.000000e+00, %i87 + %i128 = fadd double %i127, 0.000000e+00 + %i129 = fadd double %i128, 0.000000e+00 + %i130 = fadd double %i122, 0.000000e+00 + %i131 = fadd double %i123, 0.000000e+00 + %i132 = select i1 false, double 0.000000e+00, double %i131 + %i133 = fmul double %i132, 0.000000e+00 + %i134 = fmul double %i133, 0.000000e+00 + %i135 = fptosi double %i134 to i32 + %i136 = or i32 0, %i135 + %i137 = icmp slt i32 %i136, 32000 + %i138 = select i1 %i137, i32 %i136, i32 32000 + %i139 = select i1 false, double 0.000000e+00, double %i130 + %i140 = fmul double %i139, 0.000000e+00 + %i141 = fmul double %i140, 0.000000e+00 + %i142 = fptosi double %i141 to i32 + %i143 = or i32 0, %i142 + %i144 = icmp slt i32 %i143, %i138 + %i145 = select i1 %i144, i32 %i143, i32 %i138 + %i146 = select i1 false, double 0.000000e+00, double %i129 + %i147 = fmul double %i146, 0.000000e+00 + %i148 = fmul double %i147, 0.000000e+00 + %i149 = fptosi double %i148 to i32 + %i150 = or i32 0, %i149 + %i151 = icmp slt i32 %i150, %i145 + %i152 = select i1 %i151, i32 %i150, i32 %i145 + %i153 = select i1 false, double 0.000000e+00, double %i126 + %i154 = fmul double %i153, 0.000000e+00 + %i155 = fmul double %i154, 0.000000e+00 + %i156 = fptosi double %i155 to i32 + %i157 = or i32 0, %i156 + %i158 = icmp slt i32 %i157, %i152 + %i159 = select i1 %i158, i32 %i157, i32 %i152 + %i163 = fcmp ogt double %i118, 0.000000e+00 + %i164 = icmp slt 
i32 0, %i159 + unreachable + +bb58: + ret void +} diff --git a/llvm/test/Transforms/SROA/vector-promotion.ll b/llvm/test/Transforms/SROA/vector-promotion.ll index ee35a0fd5fea65..f0e605fb4ad5c9 100644 --- a/llvm/test/Transforms/SROA/vector-promotion.ll +++ b/llvm/test/Transforms/SROA/vector-promotion.ll @@ -1388,6 +1388,68 @@ define <4 x ptr> @ptrLoadStoreTysPtr(ptr %init, i64 %val2) { ret <4 x ptr> %sroaval } +define <4 x i32> @validLoadStoreTy([2 x i64] %cond.coerce) { +; CHECK-LABEL: @validLoadStoreTy( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[COND_COERCE:%.*]], 0 +; CHECK-NEXT: [[COND_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[COND_COERCE_FCA_0_EXTRACT]], i32 0 +; CHECK-NEXT: [[COND_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[COND_COERCE]], 1 +; CHECK-NEXT: [[COND_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[COND_SROA_0_0_VEC_INSERT]], i64 [[COND_COERCE_FCA_1_EXTRACT]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[COND_SROA_0_8_VEC_INSERT]] to <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP0]] +; +; DEBUG-LABEL: @validLoadStoreTy( +; DEBUG-NEXT: entry: +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META553:![0-9]+]], metadata !DIExpression()), !dbg [[DBG557:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META554:![0-9]+]], metadata !DIExpression()), !dbg [[DBG558:![0-9]+]] +; DEBUG-NEXT: [[COND_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[COND_COERCE:%.*]], 0, !dbg [[DBG559:![0-9]+]] +; DEBUG-NEXT: [[COND_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[COND_COERCE_FCA_0_EXTRACT]], i32 0, !dbg [[DBG559]] +; DEBUG-NEXT: [[COND_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[COND_COERCE]], 1, !dbg [[DBG559]] +; DEBUG-NEXT: [[COND_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[COND_SROA_0_0_VEC_INSERT]], i64 [[COND_COERCE_FCA_1_EXTRACT]], i32 1, !dbg [[DBG559]] +; 
DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META555:![0-9]+]], metadata !DIExpression()), !dbg [[DBG560:![0-9]+]] +; DEBUG-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[COND_SROA_0_8_VEC_INSERT]] to <4 x i32>, !dbg [[DBG561:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata <4 x i32> [[TMP0]], metadata [[META556:![0-9]+]], metadata !DIExpression()), !dbg [[DBG561]] +; DEBUG-NEXT: ret <4 x i32> [[TMP0]], !dbg [[DBG562:![0-9]+]] +; +entry: + %cond = alloca <4 x i32>, align 8 + %coerce.dive2 = getelementptr inbounds <4 x i32>, ptr %cond, i32 0, i32 0 + store [2 x i64] %cond.coerce, ptr %coerce.dive2, align 8 + %m5 = getelementptr inbounds <4 x i32>, ptr %cond, i32 0, i32 0 + %0 = load <4 x i32>, ptr %m5, align 8 + ret <4 x i32> %0 +} + +; The following test should not crash the compiler +; (calls to CheckCandidateType from createAndCheckVectorTypesForPromotion may change the memory to hold CandidateTys.data()) +define noundef zeroext i1 @CandidateTysRealloc() personality ptr null { +entry: + %alloca = alloca <4x i32>, align 16 + store <4 x i32> , ptr %alloca, align 16 + br label %bb.1 + +bb.1: + br label %bb.1 + +bb.2: + %Load0 = load <4 x i32>, ptr %alloca, align 16 + store <4 x i32> zeroinitializer, ptr %alloca, align 16 + %Load1 = load <4 x i32>, ptr %alloca, align 16 + br label %bb.3 + +bb.3: + br label %bb.3 + +bb.4: + %Load2 = load i64, ptr %alloca, align 16 + %Load3 = load <4 x i32>, ptr %alloca, align 16 + store <4 x i32> zeroinitializer, ptr %alloca, align 16 + br label %bb.5 + +bb.5: + br label %bb.5 +} + declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1) declare void @llvm.lifetime.end.p0(i64, ptr) ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/memssa-readnone-access.ll b/llvm/test/Transforms/SimpleLoopUnswitch/memssa-readnone-access.ll index 2aaf777683e116..c6e6608d4be383 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/memssa-readnone-access.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/memssa-readnone-access.ll @@ -115,3 +115,107 @@ split: exit: ret void } + +; Variants of the above test with swapped branch destinations. + +define void @test1_swapped(i1 %c) { +; CHECK-LABEL: define void @test1_swapped( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: start: +; CHECK-NEXT: [[C_FR:%.*]] = freeze i1 [[C]] +; CHECK-NEXT: br i1 [[C_FR]], label [[START_SPLIT_US:%.*]], label [[START_SPLIT:%.*]] +; CHECK: start.split.us: +; CHECK-NEXT: br label [[LOOP_US:%.*]] +; CHECK: loop.us: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label [[LOOP_US]] +; CHECK: start.split: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +start: + br label %loop + +loop: + %fn = load ptr, ptr @vtable, align 8 + call void %fn() + br i1 %c, label %loop, label %exit + +exit: + ret void +} + +define void @test2_swapped(i1 %c, ptr %p) { +; CHECK-LABEL: define void @test2_swapped( +; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[C_FR:%.*]] = freeze i1 [[C]] +; CHECK-NEXT: br i1 [[C_FR]], label [[DOTSPLIT_US:%.*]], label [[DOTSPLIT:%.*]] +; CHECK: .split.us: +; CHECK-NEXT: br label [[LOOP_US:%.*]] +; CHECK: loop.us: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: br label [[LOOP_US]] +; CHECK: .split: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; + br label %loop + +loop: + %fn = load ptr, ptr @vtable, align 8 + call void %fn() + call void 
@bar() + br i1 %c, label %loop, label %exit + +exit: + ret void +} + +define void @test3_swapped(i1 %c, ptr %p) { +; CHECK-LABEL: define void @test3_swapped( +; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[C_FR:%.*]] = freeze i1 [[C]] +; CHECK-NEXT: br i1 [[C_FR]], label [[DOTSPLIT_US:%.*]], label [[DOTSPLIT:%.*]] +; CHECK: .split.us: +; CHECK-NEXT: br label [[LOOP_US:%.*]] +; CHECK: loop.us: +; CHECK-NEXT: br label [[SPLIT_US:%.*]] +; CHECK: split.us: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: br label [[LOOP_US]] +; CHECK: .split: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: br label [[SPLIT:%.*]] +; CHECK: split: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; + br label %loop + +loop: + %fn = load ptr, ptr @vtable, align 8 + br label %split + +split: + call void %fn() + call void @bar() + br i1 %c, label %loop, label %exit + +exit: + ret void +} diff --git a/llvm/test/Transforms/Util/add-TLI-mappings.ll b/llvm/test/Transforms/Util/add-TLI-mappings.ll index 8ccd27030afae8..7b12de90319012 100644 --- a/llvm/test/Transforms/Util/add-TLI-mappings.ll +++ b/llvm/test/Transforms/Util/add-TLI-mappings.ll @@ -23,36 +23,30 @@ ; LIBMVEC-X86-SAME: ptr @_ZGVdN4v_sin ; SLEEFGNUABI-SAME: [16 x ptr] [ ; SLEEFGNUABI-SAME: ptr @_ZGVnN2vl8_modf, -; SLEEFGNUABI-SAME: ptr @_ZGVsMxvl8_modf, +; SLEEFGNUABI-SAME: ptr @_ZGVsNxvl8_modf, ; SLEEFGNUABI-SAME: ptr @_ZGVnN4vl4_modff, -; SLEEFGNUABI-SAME: ptr @_ZGVsMxvl4_modff, +; SLEEFGNUABI-SAME: ptr @_ZGVsNxvl4_modff, ; SLEEFGNUABI-SAME: ptr @_ZGVnN2v_sin, ; SLEEFGNUABI-SAME: ptr @_ZGVsMxv_sin, ; SLEEFGNUABI-SAME: ptr @_ZGVnN2vl8l8_sincos, -; SLEEFGNUABI-SAME: ptr @_ZGVsMxvl8l8_sincos, +; SLEEFGNUABI-SAME: ptr @_ZGVsNxvl8l8_sincos, ; SLEEFGNUABI-SAME: ptr @_ZGVnN4vl4l4_sincosf, -; SLEEFGNUABI-SAME: ptr @_ZGVsMxvl4l4_sincosf, +; SLEEFGNUABI-SAME: ptr @_ZGVsNxvl4l4_sincosf, ; 
SLEEFGNUABI-SAME: ptr @_ZGVnN2vl8l8_sincospi, -; SLEEFGNUABI-SAME: ptr @_ZGVsMxvl8l8_sincospi, +; SLEEFGNUABI-SAME: ptr @_ZGVsNxvl8l8_sincospi, ; SLEEFGNUABI-SAME: ptr @_ZGVnN4vl4l4_sincospif, -; SLEEFGNUABI-SAME: ptr @_ZGVsMxvl4l4_sincospif, +; SLEEFGNUABI-SAME: ptr @_ZGVsNxvl4l4_sincospif, ; SLEEFGNUABI_SAME; ptr @_ZGVnN4v_log10f, ; SLEEFGNUABI-SAME: ptr @_ZGVsMxv_log10f -; ARMPL-SAME: [16 x ptr] [ +; ARMPL-SAME: [10 x ptr] [ ; ARMPL-SAME: ptr @armpl_vmodfq_f64, -; ARMPL-SAME: ptr @armpl_svmodf_f64_x, ; ARMPL-SAME: ptr @armpl_vmodfq_f32, -; ARMPL-SAME: ptr @armpl_svmodf_f32_x, ; ARMPL-SAME: ptr @armpl_vsinq_f64, ; ARMPL-SAME: ptr @armpl_svsin_f64_x, ; ARMPL-SAME: ptr @armpl_vsincosq_f64, -; ARMPL-SAME: ptr @armpl_svsincos_f64_x, ; ARMPL-SAME: ptr @armpl_vsincosq_f32, -; ARMPL-SAME: ptr @armpl_svsincos_f32_x, ; ARMPL-SAME: ptr @armpl_vsincospiq_f64, -; ARMPL-SAME: ptr @armpl_svsincospi_f64_x, ; ARMPL-SAME: ptr @armpl_vsincospiq_f32, -; ARMPL-SAME: ptr @armpl_svsincospi_f32_x, ; ARMPL-SAME: ptr @armpl_vlog10q_f32, ; ARMPL-SAME: ptr @armpl_svlog10_f32_x ; COMMON-SAME: ], section "llvm.metadata" @@ -166,36 +160,30 @@ declare float @llvm.log10.f32(float) #0 ; ACCELERATE: declare <4 x float> @vlog10f(<4 x float>) ; SLEEFGNUABI: declare <2 x double> @_ZGVnN2vl8_modf(<2 x double>, ptr) -; SLEEFGNUABI: declare @_ZGVsMxvl8_modf(, ptr, ) +; SLEEFGNUABI: declare @_ZGVsNxvl8_modf(, ptr) ; SLEEFGNUABI: declare <4 x float> @_ZGVnN4vl4_modff(<4 x float>, ptr) -; SLEEFGNUABI: declare @_ZGVsMxvl4_modff(, ptr, ) +; SLEEFGNUABI: declare @_ZGVsNxvl4_modff(, ptr) ; SLEEFGNUABI: declare <2 x double> @_ZGVnN2v_sin(<2 x double>) ; SLEEFGNUABI: declare @_ZGVsMxv_sin(, ) ; SLEEFGNUABI: declare void @_ZGVnN2vl8l8_sincos(<2 x double>, ptr, ptr) -; SLEEFGNUABI: declare void @_ZGVsMxvl8l8_sincos(, ptr, ptr, ) +; SLEEFGNUABI: declare void @_ZGVsNxvl8l8_sincos(, ptr, ptr) ; SLEEFGNUABI: declare void @_ZGVnN4vl4l4_sincosf(<4 x float>, ptr, ptr) -; SLEEFGNUABI: declare void @_ZGVsMxvl4l4_sincosf(, 
ptr, ptr, ) +; SLEEFGNUABI: declare void @_ZGVsNxvl4l4_sincosf(, ptr, ptr) ; SLEEFGNUABI: declare void @_ZGVnN2vl8l8_sincospi(<2 x double>, ptr, ptr) -; SLEEFGNUABI: declare void @_ZGVsMxvl8l8_sincospi(, ptr, ptr, ) +; SLEEFGNUABI: declare void @_ZGVsNxvl8l8_sincospi(, ptr, ptr) ; SLEEFGNUABI: declare void @_ZGVnN4vl4l4_sincospif(<4 x float>, ptr, ptr) -; SLEEFGNUABI: declare void @_ZGVsMxvl4l4_sincospif(, ptr, ptr, ) +; SLEEFGNUABI: declare void @_ZGVsNxvl4l4_sincospif(, ptr, ptr) ; SLEEFGNUABI: declare <4 x float> @_ZGVnN4v_log10f(<4 x float>) ; SLEEFGNUABI: declare @_ZGVsMxv_log10f(, ) ; ARMPL: declare <2 x double> @armpl_vmodfq_f64(<2 x double>, ptr) -; ARMPL: declare @armpl_svmodf_f64_x(, ptr, ) ; ARMPL: declare <4 x float> @armpl_vmodfq_f32(<4 x float>, ptr) -; ARMPL: declare @armpl_svmodf_f32_x(, ptr, ) ; ARMPL: declare <2 x double> @armpl_vsinq_f64(<2 x double>) ; ARMPL: declare @armpl_svsin_f64_x(, ) ; ARMPL: declare void @armpl_vsincosq_f64(<2 x double>, ptr, ptr) -; ARMPL: declare void @armpl_svsincos_f64_x(, ptr, ptr, ) ; ARMPL: declare void @armpl_vsincosq_f32(<4 x float>, ptr, ptr) -; ARMPL: declare void @armpl_svsincos_f32_x(, ptr, ptr, ) ; ARMPL: declare void @armpl_vsincospiq_f64(<2 x double>, ptr, ptr) -; ARMPL: declare void @armpl_svsincospi_f64_x(, ptr, ptr, ) ; ARMPL: declare void @armpl_vsincospiq_f32(<4 x float>, ptr, ptr) -; ARMPL: declare void @armpl_svsincospi_f32_x(, ptr, ptr, ) ; ARMPL: declare <4 x float> @armpl_vlog10q_f32(<4 x float>) ; ARMPL: declare @armpl_svlog10_f32_x(, ) @@ -220,50 +208,44 @@ attributes #0 = { nounwind readnone } ; SLEEFGNUABI: attributes #[[MODF]] = { "vector-function-abi-variant"= ; SLEEFGNUABI-SAME: "_ZGV_LLVM_N2vl8_modf(_ZGVnN2vl8_modf), -; SLEEFGNUABI-SAME: _ZGVsMxvl8_modf(_ZGVsMxvl8_modf)" } +; SLEEFGNUABI-SAME: _ZGVsNxvl8_modf(_ZGVsNxvl8_modf)" } ; SLEEFGNUABI: attributes #[[MODFF]] = { "vector-function-abi-variant"= ; SLEEFGNUABI-SAME: "_ZGV_LLVM_N4vl4_modff(_ZGVnN4vl4_modff), -; SLEEFGNUABI-SAME: 
_ZGVsMxvl4_modff(_ZGVsMxvl4_modff)" } +; SLEEFGNUABI-SAME: _ZGVsNxvl4_modff(_ZGVsNxvl4_modff)" } ; SLEEFGNUABI: attributes #[[SIN]] = { "vector-function-abi-variant"= ; SLEEFGNUABI-SAME: "_ZGV_LLVM_N2v_sin(_ZGVnN2v_sin), ; SLEEFGNUABI-SAME: _ZGVsMxv_sin(_ZGVsMxv_sin)" } ; SLEEFGNUABI: attributes #[[SINCOS]] = { "vector-function-abi-variant"= ; SLEEFGNUABI-SAME: "_ZGV_LLVM_N2vl8l8_sincos(_ZGVnN2vl8l8_sincos), -; SLEEFGNUABI-SAME: _ZGVsMxvl8l8_sincos(_ZGVsMxvl8l8_sincos)" } +; SLEEFGNUABI-SAME: _ZGVsNxvl8l8_sincos(_ZGVsNxvl8l8_sincos)" } ; SLEEFGNUABI: attributes #[[SINCOSF]] = { "vector-function-abi-variant"= ; SLEEFGNUABI-SAME: "_ZGV_LLVM_N4vl4l4_sincosf(_ZGVnN4vl4l4_sincosf), -; SLEEFGNUABI-SAME: _ZGVsMxvl4l4_sincosf(_ZGVsMxvl4l4_sincosf)" } +; SLEEFGNUABI-SAME: _ZGVsNxvl4l4_sincosf(_ZGVsNxvl4l4_sincosf)" } ; SLEEFGNUABI: attributes #[[SINCOSPI]] = { "vector-function-abi-variant"= ; SLEEFGNUABI-SAME: "_ZGV_LLVM_N2vl8l8_sincospi(_ZGVnN2vl8l8_sincospi), -; SLEEFGNUABI-SAME: _ZGVsMxvl8l8_sincospi(_ZGVsMxvl8l8_sincospi)" } +; SLEEFGNUABI-SAME: _ZGVsNxvl8l8_sincospi(_ZGVsNxvl8l8_sincospi)" } ; SLEEFGNUABI: attributes #[[SINCOSPIF]] = { "vector-function-abi-variant"= ; SLEEFGNUABI-SAME: "_ZGV_LLVM_N4vl4l4_sincospif(_ZGVnN4vl4l4_sincospif), -; SLEEFGNUABI-SAME: _ZGVsMxvl4l4_sincospif(_ZGVsMxvl4l4_sincospif)" } +; SLEEFGNUABI-SAME: _ZGVsNxvl4l4_sincospif(_ZGVsNxvl4l4_sincospif)" } ; SLEEFGNUABI: attributes #[[LOG10]] = { "vector-function-abi-variant"= ; SLEEFGNUABI-SAME: "_ZGV_LLVM_N4v_llvm.log10.f32(_ZGVnN4v_log10f), ; SLEEFGNUABI-SAME: _ZGVsMxv_llvm.log10.f32(_ZGVsMxv_log10f)" } ; ARMPL: attributes #[[MODF]] = { "vector-function-abi-variant"= -; ARMPL-SAME: "_ZGV_LLVM_N2vl8_modf(armpl_vmodfq_f64), -; ARMPL-SAME: _ZGVsMxvl8_modf(armpl_svmodf_f64_x)" } +; ARMPL-SAME: "_ZGV_LLVM_N2vl8_modf(armpl_vmodfq_f64)" } ; ARMPL: attributes #[[MODFF]] = { "vector-function-abi-variant"= -; ARMPL-SAME: "_ZGV_LLVM_N4vl4_modff(armpl_vmodfq_f32), -; ARMPL-SAME: 
_ZGVsMxvl4_modff(armpl_svmodf_f32_x)" } +; ARMPL-SAME: "_ZGV_LLVM_N4vl4_modff(armpl_vmodfq_f32)" } ; ARMPL: attributes #[[SIN]] = { "vector-function-abi-variant"= ; ARMPL-SAME: "_ZGV_LLVM_N2v_sin(armpl_vsinq_f64), ; ARMPL-SAME _ZGVsMxv_sin(armpl_svsin_f64_x)" } ; ARMPL: attributes #[[SINCOS]] = { "vector-function-abi-variant"= -; ARMPL-SAME: "_ZGV_LLVM_N2vl8l8_sincos(armpl_vsincosq_f64), -; ARMPL-SAME: _ZGVsMxvl8l8_sincos(armpl_svsincos_f64_x)" } +; ARMPL-SAME: "_ZGV_LLVM_N2vl8l8_sincos(armpl_vsincosq_f64)" } ; ARMPL: attributes #[[SINCOSF]] = { "vector-function-abi-variant"= -; ARMPL-SAME: "_ZGV_LLVM_N4vl4l4_sincosf(armpl_vsincosq_f32), -; ARMPL-SAME: _ZGVsMxvl4l4_sincosf(armpl_svsincos_f32_x)" } +; ARMPL-SAME: "_ZGV_LLVM_N4vl4l4_sincosf(armpl_vsincosq_f32)" } ; ARMPL: attributes #[[SINCOSPI]] = { "vector-function-abi-variant"= -; ARMPL-SAME: "_ZGV_LLVM_N2vl8l8_sincospi(armpl_vsincospiq_f64), -; ARMPL-SAME: _ZGVsMxvl8l8_sincospi(armpl_svsincospi_f64_x)" } +; ARMPL-SAME: "_ZGV_LLVM_N2vl8l8_sincospi(armpl_vsincospiq_f64)" } ; ARMPL: attributes #[[SINCOSPIF]] = { "vector-function-abi-variant"= -; ARMPL-SAME: "_ZGV_LLVM_N4vl4l4_sincospif(armpl_vsincospiq_f32), -; ARMPL-SAME: _ZGVsMxvl4l4_sincospif(armpl_svsincospi_f32_x)" } +; ARMPL-SAME: "_ZGV_LLVM_N4vl4l4_sincospif(armpl_vsincospiq_f32)" } ; ARMPL: attributes #[[LOG10]] = { "vector-function-abi-variant"= ; ARMPL-SAME: "_ZGV_LLVM_N4v_llvm.log10.f32(armpl_vlog10q_f32), ; ARMPL-SAME _ZGVsMxv_llvm.log10.f32(armpl_svlog10_f32_x)" } diff --git a/llvm/test/Transforms/Util/flattencfg.ll b/llvm/test/Transforms/Util/flattencfg.ll index 4a4d4279f360d6..5f8dd981293345 100644 --- a/llvm/test/Transforms/Util/flattencfg.ll +++ b/llvm/test/Transforms/Util/flattencfg.ll @@ -77,13 +77,16 @@ define void @test_not_crash3(i32 %a) #0 { ; CHECK-SAME: (i32 [[A:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A_EQ_0:%.*]] = icmp eq i32 [[A]], 0 +; CHECK-NEXT: br i1 [[A_EQ_0]], label [[BB0:%.*]], label [[BB1:%.*]] +; CHECK: bb0: +; CHECK-NEXT: 
br label [[BB1]] +; CHECK: bb1: ; CHECK-NEXT: [[A_EQ_1:%.*]] = icmp eq i32 [[A]], 1 -; CHECK-NEXT: [[TMP0:%.*]] = or i1 [[A_EQ_0]], [[A_EQ_1]] -; CHECK-NEXT: br i1 [[TMP0]], label [[BB2:%.*]], label [[BB3:%.*]] +; CHECK-NEXT: br i1 [[A_EQ_1]], label [[BB2:%.*]], label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[CHECK_BADREF:%.*]] = phi i32 [ 17, [[ENTRY:%.*]] ], [ 11, [[BB2]] ] +; CHECK-NEXT: [[CHECK_BADREF:%.*]] = phi i32 [ 17, [[BB1]] ], [ 11, [[BB2]] ] ; CHECK-NEXT: ret void ; entry: @@ -278,9 +281,9 @@ define i1 @test_cond_multi_use(i32 %x, i32 %y, i32 %z) { ; CHECK-NEXT: entry.x: ; CHECK-NEXT: [[CMP_X:%.*]] = icmp ne i32 [[X]], 0 ; CHECK-NEXT: [[CMP_Y:%.*]] = icmp eq i32 [[Y]], 0 -; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[CMP_Y]], true -; CHECK-NEXT: [[TMP1:%.*]] = or i1 [[CMP_X]], [[TMP0]] -; CHECK-NEXT: br i1 [[TMP1]], label [[IF_THEN_Y:%.*]], label [[EXIT:%.*]] +; CHECK-NEXT: [[CMP_Y_NOT:%.*]] = xor i1 [[CMP_Y]], true +; CHECK-NEXT: [[TMP0:%.*]] = or i1 [[CMP_X]], [[CMP_Y_NOT]] +; CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN_Y:%.*]], label [[EXIT:%.*]] ; CHECK: if.then.y: ; CHECK-NEXT: store i32 [[Z]], ptr @g, align 4 ; CHECK-NEXT: br label [[EXIT]] diff --git a/llvm/test/Transforms/VectorCombine/RISCV/load-widening.ll b/llvm/test/Transforms/VectorCombine/RISCV/load-widening.ll new file mode 100644 index 00000000000000..0a43ad2f9a3684 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/RISCV/load-widening.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=vector-combine -S -mtriple=riscv32 -mattr=+v | FileCheck %s +; RUN: opt < %s -passes=vector-combine -S -mtriple=riscv64 -mattr=+v | FileCheck %s + +define void @fixed_load_scalable_src(ptr %p) { +; CHECK-LABEL: define void @fixed_load_scalable_src( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: store zeroinitializer, ptr [[P]], align 8 
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> zeroinitializer, <8 x i32> +; CHECK-NEXT: ret void +; +entry: + store zeroinitializer, ptr %p + %0 = load <4 x i16>, ptr %p + %1 = shufflevector <4 x i16> %0, <4 x i16> zeroinitializer, <8 x i32> + ret void +} diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index c6f9ee82e08cc1..74e7769a480598 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -414,10 +414,11 @@ def version_int(ver): config.available_features.add("llvm-dylib") config.substitutions.append( ( + # libLLVM.so.19.0git "%llvmdylib", - "{}/libLLVM-{}{}".format( - config.llvm_shlib_dir, config.llvm_dylib_version, config.llvm_shlib_ext - ), + "{}/libLLVM{}.{}".format( + config.llvm_shlib_dir, config.llvm_shlib_ext, config.llvm_dylib_version + ) ) ) diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in index 1138b2ccf7bce7..b6f255d472d16f 100644 --- a/llvm/test/lit.site.cfg.py.in +++ b/llvm/test/lit.site.cfg.py.in @@ -44,7 +44,7 @@ config.build_examples = @LLVM_BUILD_EXAMPLES@ config.enable_threads = @LLVM_ENABLE_THREADS@ config.build_shared_libs = @BUILD_SHARED_LIBS@ config.link_llvm_dylib = @LLVM_LINK_LLVM_DYLIB@ -config.llvm_dylib_version = "@LLVM_VERSION_MAJOR@@LLVM_VERSION_SUFFIX@" +config.llvm_dylib_version = "@LLVM_VERSION_MAJOR@.@LLVM_VERSION_MINOR@@LLVM_VERSION_SUFFIX@" config.llvm_host_triple = '@LLVM_HOST_TRIPLE@' config.host_arch = "@HOST_ARCH@" config.have_opt_viewer_modules = @LLVM_HAVE_OPT_VIEWER_MODULES@ diff --git a/llvm/test/tools/llvm-cov/Inputs/mcdc-macro.c b/llvm/test/tools/llvm-cov/Inputs/mcdc-macro.c new file mode 100644 index 00000000000000..bd2b979bd257f7 --- /dev/null +++ b/llvm/test/tools/llvm-cov/Inputs/mcdc-macro.c @@ -0,0 +1,20 @@ +#define C c +#define D 1 +#define E (C != a) && (C > a) +#define F E + +void __attribute__((noinline)) func1(void) { return; } + +void __attribute__((noinline)) func(int a, 
int b, int c) { + if (a && D && E || b) + func1(); + if (b && D) + func1(); + if (a && (b && C) || (D && F)) + func1(); +} + +int main() { + func(2, 3, 3); + return 0; +} diff --git a/llvm/test/tools/llvm-cov/Inputs/mcdc-macro.o b/llvm/test/tools/llvm-cov/Inputs/mcdc-macro.o new file mode 100644 index 00000000000000..667ccd132d2fb8 Binary files /dev/null and b/llvm/test/tools/llvm-cov/Inputs/mcdc-macro.o differ diff --git a/llvm/test/tools/llvm-cov/Inputs/mcdc-macro.proftext b/llvm/test/tools/llvm-cov/Inputs/mcdc-macro.proftext new file mode 100644 index 00000000000000..35ecc42b5802a6 --- /dev/null +++ b/llvm/test/tools/llvm-cov/Inputs/mcdc-macro.proftext @@ -0,0 +1,62 @@ +func +# Func Hash: +395201011017399473 +# Num Counters: +22 +# Counter Values: +1 +1 +0 +0 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +0 +1 +1 +1 +0 +0 +0 +0 +# Num Bitmap Bytes: +$13 +# Bitmap Byte Values: +0x0 +0x0 +0x0 +0x20 +0x8 +0x0 +0x20 +0x0 +0x0 +0x0 +0x0 +0x0 +0x0 + + +func1 +# Func Hash: +24 +# Num Counters: +1 +# Counter Values: +3 + +main +# Func Hash: +24 +# Num Counters: +1 +# Counter Values: +1 + diff --git a/llvm/test/tools/llvm-cov/mcdc-general-none.test b/llvm/test/tools/llvm-cov/mcdc-general-none.test index bcf8f3cbd05d45..a373075cc5e37c 100644 --- a/llvm/test/tools/llvm-cov/mcdc-general-none.test +++ b/llvm/test/tools/llvm-cov/mcdc-general-none.test @@ -52,7 +52,7 @@ // Test html output. 
// RUN: llvm-cov show --show-mcdc-summary --show-mcdc %S/Inputs/mcdc-general.o -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs %S/Inputs/mcdc-general.cpp -format html -o %t.html.dir // RUN: FileCheck -check-prefix=HTML -input-file=%t.html.dir/coverage/tmp/mcdc-general.cpp.html %s -// HTML-COUNT-4: MC/DC Decision Region ( +// HTML-COUNT-4: MC/DC Decision Region ( // RUN: FileCheck -check-prefix HTML-INDEX -input-file %t.html.dir/index.html %s // HTML-INDEX-LABEL: diff --git a/llvm/test/tools/llvm-cov/mcdc-general.test b/llvm/test/tools/llvm-cov/mcdc-general.test index 588aed09c16a5e..ded2f3eb1c9a5d 100644 --- a/llvm/test/tools/llvm-cov/mcdc-general.test +++ b/llvm/test/tools/llvm-cov/mcdc-general.test @@ -118,7 +118,7 @@ // Test html output. // RUN: llvm-cov show --show-mcdc-summary --show-mcdc %S/Inputs/mcdc-general.o -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs %S/Inputs/mcdc-general.cpp -format html -o %t.html.dir // RUN: FileCheck -check-prefix=HTML -input-file=%t.html.dir/coverage/tmp/mcdc-general.cpp.html %s -// HTML-COUNT-4: MC/DC Decision Region ( +// HTML-COUNT-4: MC/DC Decision Region ( // RUN: FileCheck -check-prefix HTML-INDEX -input-file %t.html.dir/index.html %s // HTML-INDEX-LABEL:
diff --git a/llvm/test/tools/llvm-cov/mcdc-macro.test b/llvm/test/tools/llvm-cov/mcdc-macro.test new file mode 100644 index 00000000000000..339284bba2c9bd --- /dev/null +++ b/llvm/test/tools/llvm-cov/mcdc-macro.test @@ -0,0 +1,99 @@ +// Test visualization of MC/DC constructs for branches in macro expansions. + +// RUN: llvm-profdata merge %S/Inputs/mcdc-macro.proftext -o %t.profdata +// RUN: llvm-cov show --show-expansions --show-branches=count --show-mcdc %S/Inputs/mcdc-macro.o -instr-profile %t.profdata --compilation-dir=%S/Inputs | FileCheck %s + +// CHECK: | | | Branch (2:11): [Folded - Ignored] +// CHECK: | | | Branch (3:11): [True: 1, False: 0] +// CHECK: | | | Branch (3:23): [True: 1, False: 0] +// CHECK: | Branch (9:7): [True: 1, False: 0] +// CHECK-NEXT: | Branch (9:22): [True: 0, False: 0] +// CHECK-NEXT: ------------------ +// CHECK-NEXT: |---> MC/DC Decision Region (9:7) to (9:23) +// CHECK-NEXT: | +// CHECK-NEXT: | Number of Conditions: 5 +// CHECK-NEXT: | Condition C1 --> (9:7) +// CHECK-NEXT: | Condition C2 --> (9:22) +// CHECK-NEXT: | Condition C3 --> (2:11) +// CHECK-NEXT: | Condition C4 --> (3:11) +// CHECK-NEXT: | Condition C5 --> (3:23) +// CHECK-NEXT: | +// CHECK-NEXT: | Executed MC/DC Test Vectors: +// CHECK-NEXT: | +// CHECK-NEXT: | C1, C2, C3, C4, C5 Result +// CHECK-NEXT: | 1 { T, -, C, T, T = T } +// CHECK-NEXT: | +// CHECK-NEXT: | C1-Pair: not covered +// CHECK-NEXT: | C2-Pair: not covered +// CHECK-NEXT: | C3-Pair: constant folded +// CHECK-NEXT: | C4-Pair: not covered +// CHECK-NEXT: | C5-Pair: not covered +// CHECK-NEXT: | MC/DC Coverage for Decision: 0.00% +// CHECK-NEXT: | +// CHECK-NEXT: ------------------ + +// CHECK: | | | Branch (2:11): [Folded - Ignored] +// CHECK: | Branch (11:7): [True: 1, False: 0] +// CHECK-NEXT: ------------------ +// CHECK-NEXT: |---> MC/DC Decision Region (11:7) to (11:13) +// CHECK-NEXT: | +// CHECK-NEXT: | Number of Conditions: 2 +// CHECK-NEXT: | Condition C1 --> (11:7) +// CHECK-NEXT: | Condition C2 
--> (2:11) +// CHECK-NEXT: | +// CHECK-NEXT: | Executed MC/DC Test Vectors: +// CHECK-NEXT: | +// CHECK-NEXT: | C1, C2 Result +// CHECK-NEXT: | 1 { T, C = T } +// CHECK-NEXT: | +// CHECK-NEXT: | C1-Pair: not covered +// CHECK-NEXT: | C2-Pair: constant folded +// CHECK-NEXT: | MC/DC Coverage for Decision: 0.00% +// CHECK-NEXT: | +// CHECK-NEXT: ------------------ + +// CHECK: | | | Branch (1:11): [True: 1, False: 0] +// CHECK: | | | Branch (2:11): [Folded - Ignored] +// CHECK: | | | | | Branch (3:11): [True: 0, False: 0] +// CHECK: | | | | | Branch (3:23): [True: 0, False: 0] +// CHECK: | Branch (13:7): [True: 1, False: 0] +// CHECK-NEXT: | Branch (13:13): [True: 1, False: 0] +// CHECK-NEXT: ------------------ +// CHECK-NEXT: |---> MC/DC Decision Region (13:7) to (13:32) +// CHECK-NEXT: | +// CHECK-NEXT: | Number of Conditions: 6 +// CHECK-NEXT: | Condition C1 --> (13:7) +// CHECK-NEXT: | Condition C2 --> (13:13) +// CHECK-NEXT: | Condition C3 --> (1:11) +// CHECK-NEXT: | Condition C4 --> (2:11) +// CHECK-NEXT: | Condition C5 --> (3:11) +// CHECK-NEXT: | Condition C6 --> (3:23) +// CHECK-NEXT: | +// CHECK-NEXT: | Executed MC/DC Test Vectors: +// CHECK-NEXT: | +// CHECK-NEXT: | C1, C2, C3, C4, C5, C6 Result +// CHECK-NEXT: | 1 { T, T, T, C, -, - = T } +// CHECK-NEXT: | +// CHECK-NEXT: | C1-Pair: not covered +// CHECK-NEXT: | C2-Pair: not covered +// CHECK-NEXT: | C3-Pair: not covered +// CHECK-NEXT: | C4-Pair: constant folded +// CHECK-NEXT: | C5-Pair: not covered +// CHECK-NEXT: | C6-Pair: not covered +// CHECK-NEXT: | MC/DC Coverage for Decision: 0.00% +// CHECK-NEXT: | +// CHECK-NEXT: ------------------ + +Instructions for regenerating the test: + +cd %S/Inputs # Or copy mcdc-macro.c into the working directory + +clang -fcoverage-mcdc -fprofile-instr-generate -fcoverage-compilation-dir=. 
\ + -O3 -mllvm -enable-name-compression=false \ + -fcoverage-mapping mcdc-macro.c -c + +# Instructions for generating proftext +clang -fprofile-instr-generate mcdc-macro.o +./a.out +llvm-profdata merge --sparse -o default.profdata default.profraw +llvm-profdata merge --text -o mcdc-macro.proftext default.profdata diff --git a/llvm/test/tools/llvm-dlltool/coff-decorated.def b/llvm/test/tools/llvm-dlltool/coff-decorated.def index 856804686168b1..fc81f23d09d6c4 100644 --- a/llvm/test/tools/llvm-dlltool/coff-decorated.def +++ b/llvm/test/tools/llvm-dlltool/coff-decorated.def @@ -14,25 +14,32 @@ OtherStdcallExportName@4=CdeclInternalFunction CdeclExportName=StdcallInternalFunction@4 ; CHECK: Name type: noprefix +; CHECK-NEXT: Export name: CdeclFunction ; CHECK-NEXT: Symbol: __imp__CdeclFunction ; CHECK-NEXT: Symbol: _CdeclFunction ; CHECK: Name type: undecorate +; CHECK-NEXT: Export name: StdcallFunction ; CHECK-NEXT: Symbol: __imp__StdcallFunction@4 ; CHECK-NEXT: Symbol: _StdcallFunction@4 ; CHECK: Name type: undecorate +; CHECK-NEXT: Export name: FastcallFunction ; CHECK-NEXT: Symbol: __imp_@FastcallFunction@4 ; CHECK-NEXT: Symbol: @FastcallFunction@4 ; CHECK: Name type: name +; CHECK-NEXT: Export name: ??_7exception@@6B@ ; CHECK-NEXT: Symbol: __imp_??_7exception@@6B@ ; CHECK-NEXT: Symbol: ??_7exception@@6B@ ; CHECK-NM: W _StdcallAlias@4 ; CHECK-NM: U _StdcallFunction@4 ; CHECK: Name type: undecorate +; CHECK-NEXT: Export name: StdcallExportName ; CHECK-NEXT: Symbol: __imp__StdcallExportName@4{{$}} ; CHECK-NEXT: Symbol: _StdcallExportName@4{{$}} ; CHECK: Name type: undecorate +; CHECK-NEXT: Export name: OtherStdcallExportName ; CHECK-NEXT: Symbol: __imp__OtherStdcallExportName@4{{$}} ; CHECK-NEXT: Symbol: _OtherStdcallExportName@4{{$}} ; CHECK: Name type: noprefix +; CHECK-NEXT: Export name: CdeclExportName ; CHECK-NEXT: Symbol: __imp__CdeclExportName ; CHECK-NEXT: Symbol: _CdeclExportName diff --git a/llvm/test/tools/llvm-dlltool/coff-exports.def 
b/llvm/test/tools/llvm-dlltool/coff-exports.def index 57c55744602156..267424db1b8c1d 100644 --- a/llvm/test/tools/llvm-dlltool/coff-exports.def +++ b/llvm/test/tools/llvm-dlltool/coff-exports.def @@ -17,12 +17,15 @@ AnotherFunction ; CHECK-ARM64: Format: COFF-import-file-ARM64 ; CHECK: Type: code ; CHECK: Name type: name +; CHECK-NEXT: Export name: TestFunction1 ; CHECK-NEXT: Symbol: __imp_TestFunction1 ; CHECK-NEXT: Symbol: TestFunction1 ; CHECK: Name type: name +; CHECK-NEXT: Export name: TestFunction2 ; CHECK-NEXT: Symbol: __imp_TestFunction2{{$}} ; CHECK-NEXT: Symbol: TestFunction2{{$}} ; CHECK: Name type: name +; CHECK-NEXT: Export name: TestFunction3 ; CHECK-NEXT: Symbol: __imp_TestFunction3{{$}} ; CHECK-NEXT: Symbol: TestFunction3{{$}} diff --git a/llvm/test/tools/llvm-dlltool/coff-noname.def b/llvm/test/tools/llvm-dlltool/coff-noname.def index 27e60efbd2d802..7cb05846ce28a2 100644 --- a/llvm/test/tools/llvm-dlltool/coff-noname.def +++ b/llvm/test/tools/llvm-dlltool/coff-noname.def @@ -12,5 +12,6 @@ ByNameFunction ; CHECK-NEXT: Symbol: __imp__ByOrdinalFunction ; CHECK-NEXT: Symbol: _ByOrdinalFunction ; CHECK: Name type: noprefix +; CHECK-NEXT: Export name: ByNameFunction ; CHECK-NEXT: Symbol: __imp__ByNameFunction ; CHECK-NEXT: Symbol: _ByNameFunction diff --git a/llvm/test/tools/llvm-dlltool/no-leading-underscore.def b/llvm/test/tools/llvm-dlltool/no-leading-underscore.def index 6b78e15d2b5f69..9c5e77ca29a821 100644 --- a/llvm/test/tools/llvm-dlltool/no-leading-underscore.def +++ b/llvm/test/tools/llvm-dlltool/no-leading-underscore.def @@ -9,9 +9,11 @@ alias == func DecoratedFunction@4 ; CHECK: Name type: name +; CHECK-NEXT: Export name: func ; CHECK-NEXT: Symbol: __imp_func ; CHECK-NEXT: Symbol: func ; CHECK: Name type: undecorate +; CHECK-NEXT: Export name: DecoratedFunction ; CHECK-NEXT: Symbol: __imp_DecoratedFunction@4 ; CHECK-NEXT: Symbol: DecoratedFunction@4 diff --git a/llvm/test/tools/llvm-lib/arm64ec-implib.test 
b/llvm/test/tools/llvm-lib/arm64ec-implib.test index 2672f8d38b7f70..ebc1b166ee4ea6 100644 --- a/llvm/test/tools/llvm-lib/arm64ec-implib.test +++ b/llvm/test/tools/llvm-lib/arm64ec-implib.test @@ -11,9 +11,26 @@ ARMAP-NEXT: __NULL_IMPORT_DESCRIPTOR in test.dll ARMAP-NEXT: test_NULL_THUNK_DATA in test.dll ARMAP-EMPTY: ARMAP-NEXT: Archive EC map +ARMAP-NEXT: #expname in test.dll +ARMAP-NEXT: #funcexp in test.dll +ARMAP-NEXT: #mangledfunc in test.dll +ARMAP-NEXT: ?test_cpp_func@@$$hYAHPEAX@Z in test.dll +ARMAP-NEXT: ?test_cpp_func@@YAHPEAX@Z in test.dll +ARMAP-NEXT: __IMPORT_DESCRIPTOR_test in test.dll +ARMAP-NEXT: __NULL_IMPORT_DESCRIPTOR in test.dll +ARMAP-NEXT: __imp_?test_cpp_func@@YAHPEAX@Z in test.dll +ARMAP-NEXT: __imp_aux_?test_cpp_func@@YAHPEAX@Z in test.dll +ARMAP-NEXT: __imp_aux_expname in test.dll +ARMAP-NEXT: __imp_aux_funcexp in test.dll +ARMAP-NEXT: __imp_aux_mangledfunc in test.dll ARMAP-NEXT: __imp_dataexp in test.dll +ARMAP-NEXT: __imp_expname in test.dll ARMAP-NEXT: __imp_funcexp in test.dll +ARMAP-NEXT: __imp_mangledfunc in test.dll +ARMAP-NEXT: expname in test.dll ARMAP-NEXT: funcexp in test.dll +ARMAP-NEXT: mangledfunc in test.dll +ARMAP-NEXT: test_NULL_THUNK_DATA in test.dll RUN: llvm-readobj test.lib | FileCheck -check-prefix=READOBJ %s @@ -35,14 +52,48 @@ READOBJ-EMPTY: READOBJ-NEXT: File: test.dll READOBJ-NEXT: Format: COFF-import-file-ARM64EC READOBJ-NEXT: Type: code -READOBJ-NEXT: Name type: name +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: funcexp READOBJ-NEXT: Symbol: __imp_funcexp READOBJ-NEXT: Symbol: funcexp +READOBJ-NEXT: Symbol: __imp_aux_funcexp +READOBJ-NEXT: Symbol: #funcexp +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-ARM64EC +READOBJ-NEXT: Type: code +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: mangledfunc +READOBJ-NEXT: Symbol: __imp_mangledfunc +READOBJ-NEXT: Symbol: mangledfunc +READOBJ-NEXT: Symbol: __imp_aux_mangledfunc +READOBJ-NEXT: Symbol: 
#mangledfunc +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-ARM64EC +READOBJ-NEXT: Type: code +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: ?test_cpp_func@@YAHPEAX@Z +READOBJ-NEXT: Symbol: __imp_?test_cpp_func@@YAHPEAX@Z +READOBJ-NEXT: Symbol: ?test_cpp_func@@YAHPEAX@Z +READOBJ-NEXT: Symbol: __imp_aux_?test_cpp_func@@YAHPEAX@Z +READOBJ-NEXT: Symbol: ?test_cpp_func@@$$hYAHPEAX@Z +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-ARM64EC +READOBJ-NEXT: Type: code +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: expname +READOBJ-NEXT: Symbol: __imp_expname +READOBJ-NEXT: Symbol: expname +READOBJ-NEXT: Symbol: __imp_aux_expname +READOBJ-NEXT: Symbol: #expname READOBJ-EMPTY: READOBJ-NEXT: File: test.dll READOBJ-NEXT: Format: COFF-import-file-ARM64EC READOBJ-NEXT: Type: data READOBJ-NEXT: Name type: name +READOBJ-NEXT: Export name: dataexp READOBJ-NEXT: Symbol: __imp_dataexp Creating a new lib containing the existing lib: @@ -53,4 +104,7 @@ RUN: llvm-nm --print-armap test2.lib | FileCheck -check-prefix=ARMAP %s LIBRARY test.dll EXPORTS funcexp + #mangledfunc + ?test_cpp_func@@YAHPEAX@Z + expname=impname dataexp DATA diff --git a/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/basic-instructions.s new file mode 100644 index 00000000000000..7dd05eb50085c8 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/basic-instructions.s @@ -0,0 +1,3724 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=ampere1b -instruction-tables < %s | FileCheck %s + +#------------------------------------------------------------------------------ +# Add/sub (immediate) +#------------------------------------------------------------------------------ + +add w2, w3, #4095 +add w30, w29, #1, lsl #12 +add w13, w5, #4095, lsl #12 +add x5, 
x7, #1638 +add w20, wsp, #801 +add wsp, wsp, #1104 +add wsp, w30, #4084 +add x0, x24, #291 +add x3, x24, #4095, lsl #12 +add x8, sp, #1074 +add sp, x29, #3816 +sub w0, wsp, #4077 +sub w4, w20, #546, lsl #12 +sub sp, sp, #288 +sub wsp, w19, #16 +adds w13, w23, #291, lsl #12 +cmn w2, #4095 +adds w20, wsp, #0 +cmn x3, #1, lsl #12 +cmp sp, #20, lsl #12 +cmp x30, #4095 +subs x4, sp, #3822 +cmn w3, #291, lsl #12 +cmn wsp, #1365 +cmn sp, #1092, lsl #12 +mov sp, x30 +mov wsp, w20 +mov x11, sp +mov w24, wsp + +#------------------------------------------------------------------------------ +# Add-subtract (shifted register) +#------------------------------------------------------------------------------ + +add w3, w5, w7 +add wzr, w3, w5 +add w20, wzr, w4 +add w4, w6, wzr +add w11, w13, w15 +add w9, w3, wzr, lsl #10 +add w17, w29, w20, lsl #31 +add w21, w22, w23, lsr #0 +add w24, w25, w26, lsr #18 +add w27, w28, w29, lsr #31 +add w2, w3, w4, asr #0 +add w5, w6, w7, asr #21 +add w8, w9, w10, asr #31 +add x3, x5, x7 +add xzr, x3, x5 +add x20, xzr, x4 +add x4, x6, xzr +add x11, x13, x15 +add x9, x3, xzr, lsl #10 +add x17, x29, x20, lsl #63 +add x21, x22, x23, lsr #0 +add x24, x25, x26, lsr #18 +add x27, x28, x29, lsr #63 +add x2, x3, x4, asr #0 +add x5, x6, x7, asr #21 +add x8, x9, x10, asr #63 +adds w3, w5, w7 +cmn w3, w5 +adds w20, wzr, w4 +adds w4, w6, wzr +adds w11, w13, w15 +adds w9, w3, wzr, lsl #10 +adds w17, w29, w20, lsl #31 +adds w21, w22, w23, lsr #0 +adds w24, w25, w26, lsr #18 +adds w27, w28, w29, lsr #31 +adds w2, w3, w4, asr #0 +adds w5, w6, w7, asr #21 +adds w8, w9, w10, asr #31 +adds x3, x5, x7 +cmn x3, x5 +adds x20, xzr, x4 +adds x4, x6, xzr +adds x11, x13, x15 +adds x9, x3, xzr, lsl #10 +adds x17, x29, x20, lsl #63 +adds x21, x22, x23, lsr #0 +adds x24, x25, x26, lsr #18 +adds x27, x28, x29, lsr #63 +adds x2, x3, x4, asr #0 +adds x5, x6, x7, asr #21 +adds x8, x9, x10, asr #63 +sub w3, w5, w7 +sub wzr, w3, w5 +sub w4, w6, wzr +sub w11, w13, w15 +sub w9, w3, 
wzr, lsl #10 +sub w17, w29, w20, lsl #31 +sub w21, w22, w23, lsr #0 +sub w24, w25, w26, lsr #18 +sub w27, w28, w29, lsr #31 +sub w2, w3, w4, asr #0 +sub w5, w6, w7, asr #21 +sub w8, w9, w10, asr #31 +sub x3, x5, x7 +sub xzr, x3, x5 +sub x4, x6, xzr +sub x11, x13, x15 +sub x9, x3, xzr, lsl #10 +sub x17, x29, x20, lsl #63 +sub x21, x22, x23, lsr #0 +sub x24, x25, x26, lsr #18 +sub x27, x28, x29, lsr #63 +sub x2, x3, x4, asr #0 +sub x5, x6, x7, asr #21 +sub x8, x9, x10, asr #63 +subs w3, w5, w7 +cmp w3, w5 +subs w4, w6, wzr +subs w11, w13, w15 +subs w9, w3, wzr, lsl #10 +subs w17, w29, w20, lsl #31 +subs w21, w22, w23, lsr #0 +subs w24, w25, w26, lsr #18 +subs w27, w28, w29, lsr #31 +subs w2, w3, w4, asr #0 +subs w5, w6, w7, asr #21 +subs w8, w9, w10, asr #31 +subs x3, x5, x7 +cmp x3, x5 +subs x4, x6, xzr +subs x11, x13, x15 +subs x9, x3, xzr, lsl #10 +subs x17, x29, x20, lsl #63 +subs x21, x22, x23, lsr #0 +subs x24, x25, x26, lsr #18 +subs x27, x28, x29, lsr #63 +subs x2, x3, x4, asr #0 +subs x5, x6, x7, asr #21 +subs x8, x9, x10, asr #63 +cmn wzr, w4 +cmn w5, wzr +cmn w6, w7 +cmn w8, w9, lsl #15 +cmn w10, w11, lsl #31 +cmn w12, w13, lsr #0 +cmn w14, w15, lsr #21 +cmn w16, w17, lsr #31 +cmn w18, w19, asr #0 +cmn w20, w21, asr #22 +cmn w22, w23, asr #31 +cmn x0, x3 +cmn xzr, x4 +cmn x5, xzr +cmn x6, x7 +cmn x8, x9, lsl #15 +cmn x10, x11, lsl #63 +cmn x12, x13, lsr #0 +cmn x14, x15, lsr #41 +cmn x16, x17, lsr #63 +cmn x18, x19, asr #0 +cmn x20, x21, asr #55 +cmn x22, x23, asr #63 +cmp w0, w3 +cmp wzr, w4 +cmp w5, wzr +cmp w6, w7 +cmp w8, w9, lsl #15 +cmp w10, w11, lsl #31 +cmp w12, w13, lsr #0 +cmp w14, w15, lsr #21 +cmp w18, w19, asr #0 +cmp w20, w21, asr #22 +cmp w22, w23, asr #31 +cmp x0, x3 +cmp xzr, x4 +cmp x5, xzr +cmp x6, x7 +cmp x8, x9, lsl #15 +cmp x10, x11, lsl #63 +cmp x12, x13, lsr #0 +cmp x14, x15, lsr #41 +cmp x16, x17, lsr #63 +cmp x18, x19, asr #0 +cmp x20, x21, asr #55 +cmp x22, x23, asr #63 +cmp wzr, w0 +cmp xzr, x0 + 
+#------------------------------------------------------------------------------ +# Add-subtract (shifted register) +#------------------------------------------------------------------------------ + +adc w29, w27, w25 +adc wzr, w3, w4 +adc w9, wzr, w10 +adc w20, w0, wzr +adc x29, x27, x25 +adc xzr, x3, x4 +adc x9, xzr, x10 +adc x20, x0, xzr +adcs w29, w27, w25 +adcs wzr, w3, w4 +adcs w9, wzr, w10 +adcs w20, w0, wzr +adcs x29, x27, x25 +adcs xzr, x3, x4 +adcs x9, xzr, x10 +adcs x20, x0, xzr +sbc w29, w27, w25 +sbc wzr, w3, w4 +ngc w9, w10 +sbc w20, w0, wzr +sbc x29, x27, x25 +sbc xzr, x3, x4 +ngc x9, x10 +sbc x20, x0, xzr +sbcs w29, w27, w25 +sbcs wzr, w3, w4 +ngcs w9, w10 +sbcs w20, w0, wzr +sbcs x29, x27, x25 +sbcs xzr, x3, x4 +ngcs x9, x10 +sbcs x20, x0, xzr +ngc w3, w12 +ngc wzr, w9 +ngc w23, wzr +ngc x29, x30 +ngc xzr, x0 +ngc x0, xzr +ngcs w3, w12 +ngcs wzr, w9 +ngcs w23, wzr +ngcs x29, x30 +ngcs xzr, x0 +ngcs x0, xzr + +#------------------------------------------------------------------------------ +# Compare and branch (immediate) +#------------------------------------------------------------------------------ + +sbfx x1, x2, #3, #2 +asr x3, x4, #63 +asr wzr, wzr, #31 +sbfx w12, w9, #0, #1 +ubfiz x4, x5, #52, #11 +ubfx xzr, x4, #0, #1 +ubfiz x4, xzr, #1, #6 +lsr x5, x6, #12 +bfi x4, x5, #52, #11 +bfxil xzr, x4, #0, #1 +bfi x4, xzr, #1, #6 +bfxil x5, x6, #12, #52 +sxtb w1, w2 +sxtb xzr, w3 +sxth w9, w10 +sxth x0, w1 +sxtw x3, w30 +uxtb w1, w2 +uxth w9, w10 +ubfx x3, x30, #0, #32 +asr w3, w2, #0 +asr w9, w10, #31 +asr x20, x21, #63 +asr w1, wzr, #3 +lsr w3, w2, #0 +lsr w9, w10, #31 +lsr x20, x21, #63 +lsr wzr, wzr, #3 +lsr w3, w2, #0 +lsl w9, w10, #31 +lsl x20, x21, #63 +lsl w1, wzr, #3 +sbfx w9, w10, #0, #1 +sbfiz x2, x3, #63, #1 +asr x19, x20, #0 +sbfiz x9, x10, #5, #59 +asr w9, w10, #0 +sbfiz w11, w12, #31, #1 +sbfiz w13, w14, #29, #3 +sbfiz xzr, xzr, #10, #11 +sbfx w9, w10, #0, #1 +asr x2, x3, #63 +asr x19, x20, #0 +asr x9, x10, #5 +asr w9, w10, #0 +asr 
w11, w12, #31 +asr w13, w14, #29 +sbfx xzr, xzr, #10, #11 +bfxil w9, w10, #0, #1 +bfi x2, x3, #63, #1 +bfxil x19, x20, #0, #64 +bfi x9, x10, #5, #59 +bfxil w9, w10, #0, #32 +bfi w11, w12, #31, #1 +bfi w13, w14, #29, #3 +bfi xzr, xzr, #10, #11 +bfxil w9, w10, #0, #1 +bfxil x2, x3, #63, #1 +bfxil x19, x20, #0, #64 +bfxil x9, x10, #5, #59 +bfxil w9, w10, #0, #32 +bfxil w11, w12, #31, #1 +bfxil w13, w14, #29, #3 +bfxil xzr, xzr, #10, #11 +ubfx w9, w10, #0, #1 +lsl x2, x3, #63 +lsr x19, x20, #0 +lsl x9, x10, #5 +lsr w9, w10, #0 +lsl w11, w12, #31 +lsl w13, w14, #29 +ubfiz xzr, xzr, #10, #11 +ubfx w9, w10, #0, #1 +lsr x2, x3, #63 +lsr x19, x20, #0 +lsr x9, x10, #5 +lsr w9, w10, #0 +lsr w11, w12, #31 +lsr w13, w14, #29 +ubfx xzr, xzr, #10, #11 + +#------------------------------------------------------------------------------ +# Compare and branch (immediate) +#------------------------------------------------------------------------------ + +cbz w5, #4 +cbz x5, #0 +cbnz x2, #-4 +cbnz x26, #1048572 +cbz wzr, #0 +cbnz xzr, #0 + +#------------------------------------------------------------------------------ +# Conditional branch (immediate) +#------------------------------------------------------------------------------ + +b.ne #4 +b.ge #1048572 +b.ge #-4 + +#------------------------------------------------------------------------------ +# Conditional compare (immediate) +#------------------------------------------------------------------------------ + +ccmp w1, #31, #0, eq +ccmp w3, #0, #15, hs +ccmp wzr, #15, #13, hs +ccmp x9, #31, #0, le +ccmp x3, #0, #15, gt +ccmp xzr, #5, #7, ne +ccmn w1, #31, #0, eq +ccmn w3, #0, #15, hs +ccmn wzr, #15, #13, hs +ccmn x9, #31, #0, le +ccmn x3, #0, #15, gt +ccmn xzr, #5, #7, ne + +#------------------------------------------------------------------------------ +# Conditional compare (register) +#------------------------------------------------------------------------------ + +ccmp w1, wzr, #0, eq +ccmp w3, w0, #15, hs +ccmp wzr, w15, #13, 
hs +ccmp x9, xzr, #0, le +ccmp x3, x0, #15, gt +ccmp xzr, x5, #7, ne +ccmn w1, wzr, #0, eq +ccmn w3, w0, #15, hs +ccmn wzr, w15, #13, hs +ccmn x9, xzr, #0, le +ccmn x3, x0, #15, gt +ccmn xzr, x5, #7, ne + +#------------------------------------------------------------------------------ +# Conditional branch (immediate) +#------------------------------------------------------------------------------ + +csel w1, w0, w19, ne +csel wzr, w5, w9, eq +csel w9, wzr, w30, gt +csel w1, w28, wzr, mi +csel x19, x23, x29, lt +csel xzr, x3, x4, ge +csel x5, xzr, x6, hs +csel x7, x8, xzr, lo +csinc w1, w0, w19, ne +csinc wzr, w5, w9, eq +csinc w9, wzr, w30, gt +csinc w1, w28, wzr, mi +csinc x19, x23, x29, lt +csinc xzr, x3, x4, ge +csinc x5, xzr, x6, hs +csinc x7, x8, xzr, lo +csinv w1, w0, w19, ne +csinv wzr, w5, w9, eq +csinv w9, wzr, w30, gt +csinv w1, w28, wzr, mi +csinv x19, x23, x29, lt +csinv xzr, x3, x4, ge +csinv x5, xzr, x6, hs +csinv x7, x8, xzr, lo +csneg w1, w0, w19, ne +csneg wzr, w5, w9, eq +csneg w9, wzr, w30, gt +csneg w1, w28, wzr, mi +csneg x19, x23, x29, lt +csneg xzr, x3, x4, ge +csneg x5, xzr, x6, hs +csneg x7, x8, xzr, lo +cset w3, eq +cset x9, pl +csetm w20, ne +csetm x30, ge +csinc w2, wzr, wzr, al +csinv x3, xzr, xzr, nv +cinc w3, w5, gt +cinc wzr, w4, le +cset w9, lt +cinc x3, x5, gt +cinc xzr, x4, le +cset x9, lt +csinc w5, w6, w6, nv +csinc x1, x2, x2, al +cinv w3, w5, gt +cinv wzr, w4, le +csetm w9, lt +cinv x3, x5, gt +cinv xzr, x4, le +csetm x9, lt +csinv x1, x0, x0, al +csinv w9, w8, w8, nv +cneg w3, w5, gt +cneg wzr, w4, le +cneg w9, wzr, lt +cneg x3, x5, gt +cneg xzr, x4, le +cneg x9, xzr, lt +csneg x4, x8, x8, al +csinv w9, w8, w8, nv + +#------------------------------------------------------------------------------ +# Data-processing (1 source) +#------------------------------------------------------------------------------ + +rbit w0, w7 +rbit x18, x3 +rev16 w17, w1 +rev16 x5, x2 +rev w18, w0 +rev32 x20, x1 +rev x22, x2 +clz w24, w3 +clz x26, 
x4 +cls w3, w5 +cls x20, x5 + +#------------------------------------------------------------------------------ +# Data-processing (2 source) +#------------------------------------------------------------------------------ + +udiv w0, w7, w10 +udiv x9, x22, x4 +sdiv w12, w21, w0 +sdiv x13, x2, x1 +lsl w11, w12, w13 +lsl x14, x15, x16 +lsr w17, w18, w19 +lsr x20, x21, x22 +asr w23, w24, w25 +asr x26, x27, x28 +ror w0, w1, w2 +ror x3, x4, x5 +lsl w6, w7, w8 +lsl x9, x10, x11 +lsr w12, w13, w14 +lsr x15, x16, x17 +asr w18, w19, w20 +asr x21, x22, x23 +ror w24, w25, w26 +ror x27, x28, x29 + +#------------------------------------------------------------------------------ +# Data-processing (3 sources) +#------------------------------------------------------------------------------ + +smulh x30, x29, x28 +smulh xzr, x27, x26 +umulh x30, x29, x28 +umulh x23, x30, xzr +madd w1, w3, w7, w4 +madd wzr, w0, w9, w11 +madd w13, wzr, w4, w4 +madd w19, w30, wzr, w29 +mul w4, w5, w6 +madd x1, x3, x7, x4 +madd xzr, x0, x9, x11 +madd x13, xzr, x4, x4 +madd x19, x30, xzr, x29 +mul x4, x5, x6 +msub w1, w3, w7, w4 +msub wzr, w0, w9, w11 +msub w13, wzr, w4, w4 +msub w19, w30, wzr, w29 +mneg w4, w5, w6 +msub x1, x3, x7, x4 +msub xzr, x0, x9, x11 +msub x13, xzr, x4, x4 +msub x19, x30, xzr, x29 +mneg x4, x5, x6 +smaddl x3, w5, w2, x9 +smaddl xzr, w10, w11, x12 +smaddl x13, wzr, w14, x15 +smaddl x16, w17, wzr, x18 +smull x19, w20, w21 +smsubl x3, w5, w2, x9 +smsubl xzr, w10, w11, x12 +smsubl x13, wzr, w14, x15 +smsubl x16, w17, wzr, x18 +smnegl x19, w20, w21 +umaddl x3, w5, w2, x9 +umaddl xzr, w10, w11, x12 +umaddl x13, wzr, w14, x15 +umaddl x16, w17, wzr, x18 +umull x19, w20, w21 +umsubl x3, w5, w2, x9 +umsubl x16, w17, wzr, x18 +umnegl x19, w20, w21 +smulh x30, x29, x28 +smulh x23, x22, xzr +umulh x23, x22, xzr +mul x19, x20, xzr +mneg w21, w22, w23 +smull x11, w13, w17 +umull x11, w13, w17 +smnegl x11, w13, w17 +umnegl x11, w13, w17 + 
+#------------------------------------------------------------------------------ +# Extract (immediate) +#------------------------------------------------------------------------------ + +extr w3, w5, w7, #0 +extr w11, w13, w17, #31 +extr x3, x5, x7, #15 +extr x11, x13, x17, #63 +ror x19, x23, #24 +ror x29, xzr, #63 +ror w9, w13, #31 + +#------------------------------------------------------------------------------ +# Floating-point compare +#------------------------------------------------------------------------------ + +fcmp s3, s5 +fcmp s31, #0.0 +fcmp s31, #0.0 +fcmpe s29, s30 +fcmpe s15, #0.0 +fcmpe s15, #0.0 +fcmp d4, d12 +fcmp d23, #0.0 +fcmp d23, #0.0 +fcmpe d26, d22 +fcmpe d29, #0.0 +fcmpe d29, #0.0 + +#------------------------------------------------------------------------------ +# Floating-point conditional compare +#------------------------------------------------------------------------------ + +fccmp s1, s31, #0, eq +fccmp s3, s0, #15, hs +fccmp s31, s15, #13, hs +fccmp d9, d31, #0, le +fccmp d3, d0, #15, gt +fccmp d31, d5, #7, ne +fccmpe s1, s31, #0, eq +fccmpe s3, s0, #15, hs +fccmpe s31, s15, #13, hs +fccmpe d9, d31, #0, le +fccmpe d3, d0, #15, gt +fccmpe d31, d5, #7, ne + +#------------------------------------------------------------------------------- +# Floating-point conditional compare +#------------------------------------------------------------------------------- + +fcsel s3, s20, s9, pl +fcsel d9, d10, d11, mi + +#------------------------------------------------------------------------------ +# Floating-point data-processing (1 source) +#------------------------------------------------------------------------------ + +fmov s0, s1 +fabs s2, s3 +fneg s4, s5 +fsqrt s6, s7 +fcvt d8, s9 +fcvt h10, s11 +frintn s12, s13 +frintp s14, s15 +frintm s16, s17 +frintz s18, s19 +frinta s20, s21 +frintx s22, s23 +frinti s24, s25 +fmov d0, d1 +fabs d2, d3 +fneg d4, d5 +fsqrt d6, d7 +fcvt s8, d9 +fcvt h10, d11 +frintn d12, d13 +frintp d14, d15 +frintm 
d16, d17 +frintz d18, d19 +frinta d20, d21 +frintx d22, d23 +frinti d24, d25 +fcvt s26, h27 +fcvt d28, h29 + +#------------------------------------------------------------------------------ +# Floating-point data-processing (2 sources) +#------------------------------------------------------------------------------ + +fmul s20, s19, s17 +fdiv s1, s2, s3 +fadd s4, s5, s6 +fsub s7, s8, s9 +fmax s10, s11, s12 +fmin s13, s14, s15 +fmaxnm s16, s17, s18 +fminnm s19, s20, s21 +fnmul s22, s23, s2 +fmul d20, d19, d17 +fdiv d1, d2, d3 +fadd d4, d5, d6 +fsub d7, d8, d9 +fmax d10, d11, d12 +fmin d13, d14, d15 +fmaxnm d16, d17, d18 +fminnm d19, d20, d21 +fnmul d22, d23, d24 + +#------------------------------------------------------------------------------ +# Floating-point data-processing (1 source) +#------------------------------------------------------------------------------ + +fmadd s3, s5, s6, s31 +fmadd d3, d13, d0, d23 +fmsub s3, s5, s6, s31 +fmsub d3, d13, d0, d23 +fnmadd s3, s5, s6, s31 +fnmadd d3, d13, d0, d23 +fnmsub s3, s5, s6, s31 +fnmsub d3, d13, d0, d23 + +#------------------------------------------------------------------------------ +# Floating-point <-> fixed-point conversion +#------------------------------------------------------------------------------ + +fcvtzs w3, h5, #1 +fcvtzs wzr, h20, #13 +fcvtzs w19, h0, #32 +fcvtzs x3, h5, #1 +fcvtzs x12, h30, #45 +fcvtzs x19, h0, #64 +fcvtzs w3, s5, #1 +fcvtzs wzr, s20, #13 +fcvtzs w19, s0, #32 +fcvtzs x3, s5, #1 +fcvtzs x12, s30, #45 +fcvtzs x19, s0, #64 +fcvtzs w3, d5, #1 +fcvtzs wzr, d20, #13 +fcvtzs w19, d0, #32 +fcvtzs x3, d5, #1 +fcvtzs x12, d30, #45 +fcvtzs x19, d0, #64 +fcvtzu w3, h5, #1 +fcvtzu wzr, h20, #13 +fcvtzu w19, h0, #32 +fcvtzu x3, h5, #1 +fcvtzu x12, h30, #45 +fcvtzu x19, h0, #64 +fcvtzu w3, s5, #1 +fcvtzu wzr, s20, #13 +fcvtzu w19, s0, #32 +fcvtzu x3, s5, #1 +fcvtzu x12, s30, #45 +fcvtzu x19, s0, #64 +fcvtzu w3, d5, #1 +fcvtzu wzr, d20, #13 +fcvtzu w19, d0, #32 +fcvtzu x3, d5, #1 +fcvtzu x12, 
d30, #45 +fcvtzu x19, d0, #64 +scvtf h23, w19, #1 +scvtf h31, wzr, #20 +scvtf h14, w0, #32 +scvtf h23, x19, #1 +scvtf h31, xzr, #20 +scvtf h14, x0, #64 +scvtf s23, w19, #1 +scvtf s31, wzr, #20 +scvtf s14, w0, #32 +scvtf s23, x19, #1 +scvtf s31, xzr, #20 +scvtf s14, x0, #64 +scvtf d23, w19, #1 +scvtf d31, wzr, #20 +scvtf d14, w0, #32 +scvtf d23, x19, #1 +scvtf d31, xzr, #20 +scvtf d14, x0, #64 +ucvtf h23, w19, #1 +ucvtf h31, wzr, #20 +ucvtf h14, w0, #32 +ucvtf h23, x19, #1 +ucvtf h31, xzr, #20 +ucvtf h14, x0, #64 +ucvtf s23, w19, #1 +ucvtf s31, wzr, #20 +ucvtf s14, w0, #32 +ucvtf s23, x19, #1 +ucvtf s31, xzr, #20 +ucvtf s14, x0, #64 +ucvtf d23, w19, #1 +ucvtf d31, wzr, #20 +ucvtf d14, w0, #32 +ucvtf d23, x19, #1 +ucvtf d31, xzr, #20 +ucvtf d14, x0, #64 + +#------------------------------------------------------------------------------ +# Floating-point <-> integer conversion +#------------------------------------------------------------------------------ + +fcvtns w3, h31 +fcvtns xzr, h12 +fcvtnu wzr, h12 +fcvtnu x0, h0 +fcvtps wzr, h9 +fcvtps x12, h20 +fcvtpu w30, h23 +fcvtpu x29, h3 +fcvtms w2, h3 +fcvtms x4, h5 +fcvtmu w6, h7 +fcvtmu x8, h9 +fcvtzs w10, h11 +fcvtzs x12, h13 +fcvtzu w14, h15 +fcvtzu x15, h16 +scvtf h17, w18 +scvtf h19, x20 +ucvtf h21, w22 +scvtf h23, x24 +fcvtas w25, h26 +fcvtas x27, h28 +fcvtau w29, h30 +fcvtau xzr, h0 +fcvtns w3, s31 +fcvtns xzr, s12 +fcvtnu wzr, s12 +fcvtnu x0, s0 +fcvtps wzr, s9 +fcvtps x12, s20 +fcvtpu w30, s23 +fcvtpu x29, s3 +fcvtms w2, s3 +fcvtms x4, s5 +fcvtmu w6, s7 +fcvtmu x8, s9 +fcvtzs w10, s11 +fcvtzs x12, s13 +fcvtzu w14, s15 +fcvtzu x15, s16 +scvtf s17, w18 +scvtf s19, x20 +ucvtf s21, w22 +scvtf s23, x24 +fcvtas w25, s26 +fcvtas x27, s28 +fcvtau w29, s30 +fcvtau xzr, s0 +fcvtns w3, d31 +fcvtns xzr, d12 +fcvtnu wzr, d12 +fcvtnu x0, d0 +fcvtps wzr, d9 +fcvtps x12, d20 +fcvtpu w30, d23 +fcvtpu x29, d3 +fcvtms w2, d3 +fcvtms x4, d5 +fcvtmu w6, d7 +fcvtmu x8, d9 +fcvtzs w10, d11 +fcvtzs x12, d13 +fcvtzu w14, d15 +fcvtzu 
x15, d16 +scvtf d17, w18 +scvtf d19, x20 +ucvtf d21, w22 +ucvtf d23, x24 +fcvtas w25, d26 +fcvtas x27, d28 +fcvtau w29, d30 +fcvtau xzr, d0 +fmov w3, s9 +fmov s9, w3 +fmov x20, d31 +fmov d1, x15 +fmov x3, v12.d[1] +fmov v1.d[1], x19 + +#------------------------------------------------------------------------------ +# Floating-point immediate +#------------------------------------------------------------------------------ + +fmov s2, #0.12500000 +fmov s3, #1.00000000 +fmov d30, #16.00000000 +fmov s4, #1.06250000 +fmov d10, #1.93750000 +fmov s12, #-1.00000000 +fmov d16, #8.50000000 + +#------------------------------------------------------------------------------ +# Load-register (literal) +#------------------------------------------------------------------------------ + +ldr w3, #0 +ldr x29, #4 +ldrsw xzr, #-4 +ldr s0, #8 +ldr d0, #1048572 +ldr q0, #-1048576 +prfm pldl1strm, #0 +prfm #22, #0 + +#------------------------------------------------------------------------------ +# Load/store exclusive +#------------------------------------------------------------------------------ + +stxrb w18, w8, [sp] +stxrh w24, w15, [x16] +stxr w5, w6, [x17] +stxr w1, x10, [x21] +ldxrb w30, [x0] +ldxrh w17, [x4] +ldxr w22, [sp] +ldxr x11, [x29] +ldxr x11, [x29] +ldxr x11, [x29] +stxp w12, w11, w10, [sp] +stxp wzr, x27, x9, [x12] +ldxp w0, wzr, [sp] +ldxp x17, x0, [x18] +ldxp x17, x0, [x18] +stlxrb w12, w22, [x0] +stlxrh w10, w1, [x1] +stlxr w9, w2, [x2] +stlxr w9, x3, [sp] +ldaxrb w8, [x4] +ldaxrh w7, [x5] +ldaxr w6, [sp] +ldaxr x5, [x6] +ldaxr x5, [x6] +ldaxr x5, [x6] +stlxp w4, w5, w6, [sp] +stlxp wzr, x6, x7, [x1] +ldaxp w5, w18, [sp] +ldaxp x6, x19, [x22] +ldaxp x6, x19, [x22] +stlrb w24, [sp] +stlrh w25, [x30] +stlr w26, [x29] +stlr x27, [x28] +stlr x27, [x28] +stlr x27, [x28] +ldarb w23, [sp] +ldarh w22, [x30] +ldar wzr, [x29] +ldar x21, [x28] +ldar x21, [x28] +ldar x21, [x28] + +#------------------------------------------------------------------------------ +# Load/store 
(unscaled immediate) +#------------------------------------------------------------------------------ + +sturb w9, [sp] +sturh wzr, [x12, #255] +stur w16, [x0, #-256] +stur x28, [x14, #1] +ldurb w1, [x20, #255] +ldurh w20, [x1, #255] +ldur w12, [sp, #255] +ldur xzr, [x12, #255] +ldursb x9, [x7, #-256] +ldursh x17, [x19, #-256] +ldursw x20, [x15, #-256] +prfum pldl2keep, [sp, #-256] +ldursb w19, [x1, #-256] +ldursh w15, [x21, #-256] +stur b0, [sp, #1] +stur h12, [x12, #-1] +stur s15, [x0, #255] +stur d31, [x5, #25] +stur q9, [x5] +ldur b3, [sp] +ldur h5, [x4, #-256] +ldur s7, [x12, #-1] +ldur d11, [x19, #4] +ldur q13, [x1, #2] + +#------------------------------------------------------------------------------ +# Load/store (immediate post-indexed) +#------------------------------------------------------------------------------ + +strb w9, [x2], #255 +strb w10, [x3], #1 +strb w10, [x3], #-256 +strh w9, [x2], #255 +strh w9, [x2], #1 +strh w10, [x3], #-256 +str w19, [sp], #255 +str w20, [x30], #1 +str w21, [x12], #-256 +str xzr, [x9], #255 +str x2, [x3], #1 +str x19, [x12], #-256 +ldrb w9, [x2], #255 +ldrb w10, [x3], #1 +ldrb w10, [x3], #-256 +ldrh w9, [x2], #255 +ldrh w9, [x2], #1 +ldrh w10, [x3], #-256 +ldr w19, [sp], #255 +ldr w20, [x30], #1 +ldr w21, [x12], #-256 +ldr xzr, [x9], #255 +ldr x2, [x3], #1 +ldr x19, [x12], #-256 +ldrsb xzr, [x9], #255 +ldrsb x2, [x3], #1 +ldrsb x19, [x12], #-256 +ldrsh xzr, [x9], #255 +ldrsh x2, [x3], #1 +ldrsh x19, [x12], #-256 +ldrsw xzr, [x9], #255 +ldrsw x2, [x3], #1 +ldrsw x19, [x12], #-256 +ldrsb wzr, [x9], #255 +ldrsb w2, [x3], #1 +ldrsb w19, [x12], #-256 +ldrsh wzr, [x9], #255 +ldrsh w2, [x3], #1 +ldrsh w19, [x12], #-256 +str b0, [x0], #255 +str b3, [x3], #1 +str b5, [sp], #-256 +str h10, [x10], #255 +str h13, [x23], #1 +str h15, [sp], #-256 +str s20, [x20], #255 +str s23, [x23], #1 +str s25, [x0], #-256 +str d20, [x20], #255 +str d23, [x23], #1 +str d25, [x0], #-256 +ldr b0, [x0], #255 +ldr b3, [x3], #1 +ldr b5, [sp], #-256 +ldr 
h10, [x10], #255 +ldr h13, [x23], #1 +ldr h15, [sp], #-256 +ldr s20, [x20], #255 +ldr s23, [x23], #1 +ldr s25, [x0], #-256 +ldr d20, [x20], #255 +ldr d23, [x23], #1 +ldr d25, [x0], #-256 +ldr q20, [x1], #255 +ldr q23, [x9], #1 +ldr q25, [x20], #-256 +str q10, [x1], #255 +str q22, [sp], #1 +str q21, [x20], #-256 + +#------------------------------------------------------------------------------- +# Load-store register (immediate pre-indexed) +#------------------------------------------------------------------------------- + +ldr x3, [x4, #0]! +strb w9, [x2, #255]! +strb w10, [x3, #1]! +strb w10, [x3, #-256]! +strh w9, [x2, #255]! +strh w9, [x2, #1]! +strh w10, [x3, #-256]! +str w19, [sp, #255]! +str w20, [x30, #1]! +str w21, [x12, #-256]! +str xzr, [x9, #255]! +str x2, [x3, #1]! +str x19, [x12, #-256]! +ldrb w9, [x2, #255]! +ldrb w10, [x3, #1]! +ldrb w10, [x3, #-256]! +ldrh w9, [x2, #255]! +ldrh w9, [x2, #1]! +ldrh w10, [x3, #-256]! +ldr w19, [sp, #255]! +ldr w20, [x30, #1]! +ldr w21, [x12, #-256]! +ldr xzr, [x9, #255]! +ldr x2, [x3, #1]! +ldr x19, [x12, #-256]! +ldrsb xzr, [x9, #255]! +ldrsb x2, [x3, #1]! +ldrsb x19, [x12, #-256]! +ldrsh xzr, [x9, #255]! +ldrsh x2, [x3, #1]! +ldrsh x19, [x12, #-256]! +ldrsw xzr, [x9, #255]! +ldrsw x2, [x3, #1]! +ldrsw x19, [x12, #-256]! +ldrsb wzr, [x9, #255]! +ldrsb w2, [x3, #1]! +ldrsb w19, [x12, #-256]! +ldrsh wzr, [x9, #255]! +ldrsh w2, [x3, #1]! +ldrsh w19, [x12, #-256]! +str b0, [x0, #255]! +str b3, [x3, #1]! +str b5, [sp, #-256]! +str h10, [x10, #255]! +str h13, [x23, #1]! +str h15, [sp, #-256]! +str s20, [x20, #255]! +str s23, [x23, #1]! +str s25, [x0, #-256]! +str d20, [x20, #255]! +str d23, [x23, #1]! +str d25, [x0, #-256]! +ldr b0, [x0, #255]! +ldr b3, [x3, #1]! +ldr b5, [sp, #-256]! +ldr h10, [x10, #255]! +ldr h13, [x23, #1]! +ldr h15, [sp, #-256]! +ldr s20, [x20, #255]! +ldr s23, [x23, #1]! +ldr s25, [x0, #-256]! +ldr d20, [x20, #255]! +ldr d23, [x23, #1]! +ldr d25, [x0, #-256]! +ldr q20, [x1, #255]! +ldr q23, [x9, #1]! 
+ldr q25, [x20, #-256]! +str q10, [x1, #255]! +str q22, [sp, #1]! +str q21, [x20, #-256]! + +#------------------------------------------------------------------------------ +# Load/store (unprivileged) +#------------------------------------------------------------------------------ + +sttrb w9, [sp] +sttrh wzr, [x12, #255] +sttr w16, [x0, #-256] +sttr x28, [x14, #1] +ldtrb w1, [x20, #255] +ldtrh w20, [x1, #255] +ldtr w12, [sp, #255] +ldtr xzr, [x12, #255] +ldtrsb x9, [x7, #-256] +ldtrsh x17, [x19, #-256] +ldtrsw x20, [x15, #-256] +ldtrsb w19, [x1, #-256] +ldtrsh w15, [x21, #-256] + +#------------------------------------------------------------------------------ +# Load/store (unsigned immediate) +#------------------------------------------------------------------------------ + +ldr x4, [x29] +ldr x30, [x12, #32760] +ldr x20, [sp, #8] +ldr xzr, [sp] +ldr w2, [sp] +ldr w17, [sp, #16380] +ldr w13, [x2, #4] +ldrsw x2, [x5, #4] +ldrsw x23, [sp, #16380] +ldrh w2, [x4] +ldrsh w23, [x6, #8190] +ldrsh wzr, [sp, #2] +ldrsh x29, [x2, #2] +ldrb w26, [x3, #121] +ldrb w12, [x2] +ldrsb w27, [sp, #4095] +ldrsb xzr, [x15] +str x30, [sp] +str w20, [x4, #16380] +strh w17, [sp, #8190] +strb w23, [x3, #4095] +strb wzr, [x2] +ldr b31, [sp, #4095] +ldr h20, [x2, #8190] +ldr s10, [x19, #16380] +ldr d3, [x10, #32760] +str q12, [sp, #65520] + +#------------------------------------------------------------------------------ +# Load/store (register offset) +#------------------------------------------------------------------------------ + +ldrb w3, [sp, x5] +ldrb w9, [x27, x6] +ldrsb w10, [x30, x7] +ldrb w11, [x29, x3, sxtx] +strb w12, [x28, xzr, sxtx] +ldrb w14, [x26, w6, uxtw] +ldrsb w15, [x25, w7, uxtw] +ldrb w17, [x23, w9, sxtw] +ldrsb x18, [x22, w10, sxtw] +ldrsh w3, [sp, x5] +ldrsh w9, [x27, x6] +ldrh w10, [x30, x7, lsl #1] +strh w11, [x29, x3, sxtx] +ldrh w12, [x28, xzr, sxtx] +ldrsh x13, [x27, x5, sxtx #1] +ldrh w14, [x26, w6, uxtw] +ldrh w15, [x25, w7, uxtw] +ldrsh w16, [x24, w8, uxtw 
#1] +ldrh w17, [x23, w9, sxtw] +ldrh w18, [x22, w10, sxtw] +strh w19, [x21, wzr, sxtw #1] +ldr w3, [sp, x5] +ldr s9, [x27, x6] +ldr w10, [x30, x7, lsl #2] +ldr w11, [x29, x3, sxtx] +str s12, [x28, xzr, sxtx] +str w13, [x27, x5, sxtx #2] +str w14, [x26, w6, uxtw] +ldr w15, [x25, w7, uxtw] +ldr w16, [x24, w8, uxtw #2] +ldrsw x17, [x23, w9, sxtw] +ldr w18, [x22, w10, sxtw] +ldrsw x19, [x21, wzr, sxtw #2] +ldr x3, [sp, x5] +str x9, [x27, x6] +ldr d10, [x30, x7, lsl #3] +str x11, [x29, x3, sxtx] +ldr x12, [x28, xzr, sxtx] +ldr x13, [x27, x5, sxtx #3] +prfm pldl1keep, [x26, w6, uxtw] +ldr x15, [x25, w7, uxtw] +ldr x16, [x24, w8, uxtw #3] +ldr x17, [x23, w9, sxtw] +ldr x18, [x22, w10, sxtw] +str d19, [x21, wzr, sxtw #3] +ldr q3, [sp, x5] +ldr q9, [x27, x6] +ldr q10, [x30, x7, lsl #4] +str q11, [x29, x3, sxtx] +str q12, [x28, xzr, sxtx] +str q13, [x27, x5, sxtx #4] +ldr q14, [x26, w6, uxtw] +ldr q15, [x25, w7, uxtw] +ldr q16, [x24, w8, uxtw #4] +ldr q17, [x23, w9, sxtw] +str q18, [x22, w10, sxtw] +ldr q19, [x21, wzr, sxtw #4] + +#------------------------------------------------------------------------------ +# Load/store register pair (offset) +#------------------------------------------------------------------------------ + +ldp w3, w5, [sp] +stp wzr, w9, [sp, #252] +ldp w2, wzr, [sp, #-256] +ldp w9, w10, [sp, #4] +ldpsw x9, x10, [sp, #4] +ldpsw x9, x10, [x2, #-256] +ldpsw x20, x30, [sp, #252] +ldp x21, x29, [x2, #504] +ldp x22, x23, [x3, #-512] +ldp x24, x25, [x4, #8] +ldp s29, s28, [sp, #252] +stp s27, s26, [sp, #-256] +ldp s1, s2, [x3, #44] +stp d3, d5, [x9, #504] +stp d7, d11, [x10, #-512] +ldp d2, d3, [x30, #-8] +stp q3, q5, [sp] +stp q17, q19, [sp, #1008] +ldp q23, q29, [x1, #-1024] + +#------------------------------------------------------------------------------ +# Load/store register pair (post-indexed) +#------------------------------------------------------------------------------ + +ldp w3, w5, [sp], #0 +stp wzr, w9, [sp], #252 +ldp w2, wzr, [sp], #-256 +ldp 
w9, w10, [sp], #4 +ldpsw x9, x10, [sp], #4 +ldpsw x9, x10, [x2], #-256 +ldpsw x20, x30, [sp], #252 +ldp x21, x29, [x2], #504 +ldp x22, x23, [x3], #-512 +ldp x24, x25, [x4], #8 +ldp s29, s28, [sp], #252 +stp s27, s26, [sp], #-256 +ldp s1, s2, [x3], #44 +stp d3, d5, [x9], #504 +stp d7, d11, [x10], #-512 +ldp d2, d3, [x30], #-8 +stp q3, q5, [sp], #0 +stp q17, q19, [sp], #1008 +ldp q23, q29, [x1], #-1024 + +#------------------------------------------------------------------------------ +# Load/store register pair (pre-indexed) +#------------------------------------------------------------------------------ + +ldp w3, w5, [sp, #0]! +stp wzr, w9, [sp, #252]! +ldp w2, wzr, [sp, #-256]! +ldp w9, w10, [sp, #4]! +ldpsw x9, x10, [sp, #4]! +ldpsw x9, x10, [x2, #-256]! +ldpsw x20, x30, [sp, #252]! +ldp x21, x29, [x2, #504]! +ldp x22, x23, [x3, #-512]! +ldp x24, x25, [x4, #8]! +ldp s29, s28, [sp, #252]! +stp s27, s26, [sp, #-256]! +ldp s1, s2, [x3, #44]! +stp d3, d5, [x9, #504]! +stp d7, d11, [x10, #-512]! +ldp d2, d3, [x30, #-8]! +stp q3, q5, [sp, #0]! +stp q17, q19, [sp, #1008]! +ldp q23, q29, [x1, #-1024]! 
+ +#------------------------------------------------------------------------------ +# Load/store register pair (offset) +#------------------------------------------------------------------------------ + +ldnp w3, w5, [sp] +stnp wzr, w9, [sp, #252] +ldnp w2, wzr, [sp, #-256] +ldnp w9, w10, [sp, #4] +ldnp x21, x29, [x2, #504] +ldnp x22, x23, [x3, #-512] +ldnp x24, x25, [x4, #8] +ldnp s29, s28, [sp, #252] +stnp s27, s26, [sp, #-256] +ldnp s1, s2, [x3, #44] +stnp d3, d5, [x9, #504] +stnp d7, d11, [x10, #-512] +ldnp d2, d3, [x30, #-8] +stnp q3, q5, [sp] +stnp q17, q19, [sp, #1008] +ldnp q23, q29, [x1, #-1024] + +#------------------------------------------------------------------------------ +# Logical (immediate) +#------------------------------------------------------------------------------ + +mov w3, #983055 +mov x10, #-6148914691236517206 + +#------------------------------------------------------------------------------ +# Logical (shifted register) +#------------------------------------------------------------------------------ + +and w12, w23, w21 +and w16, w15, w1, lsl #1 +and w9, w4, w10, lsl #31 +and w3, w30, w11 +and x3, x5, x7, lsl #63 +and x5, x14, x19, asr #4 +and w3, w17, w19, ror #31 +and w0, w2, wzr, lsr #17 +and w3, w30, w11, asr #2 +and xzr, x4, x26 +and w3, wzr, w20, ror #2 +and x7, x20, xzr, asr #63 +bic x13, x20, x14, lsl #47 +bic w2, w7, w9 +orr w2, w7, w0, asr #31 +orr x8, x9, x10, lsl #12 +orn x3, x5, x7, asr #2 +orn w2, w5, w29 +ands w7, wzr, w9, lsl #1 +ands x3, x5, x20, ror #63 +bics w3, w5, w7 +bics x3, xzr, x3, lsl #1 +tst w3, w7, lsl #31 +tst x2, x20, asr #2 +mov x3, x6 +mov x3, xzr +mov wzr, w2 +mov w3, w5 + +#------------------------------------------------------------------------------ +# Move wide (immediate) +#------------------------------------------------------------------------------ + +movz w2, #0, lsl #16 +mov w2, #-1235 +mov x2, #5299989643264 +mov x2, #0 +movk w3, #0 +movz x4, #0, lsl #16 +movk w5, #0, lsl #16 +movz x6, #0, lsl 
#32 +movk x7, #0, lsl #32 +movz x8, #0, lsl #48 +movk x9, #0, lsl #48 + +#------------------------------------------------------------------------------ +# PC-relative addressing +#------------------------------------------------------------------------------ + +adr x2, #1600 +adrp x21, #6553600 +adr x0, #262144 + +#------------------------------------------------------------------------------ +# Test and branch (immediate) +#------------------------------------------------------------------------------ + +tbz x12, #62, #0 +tbz x12, #62, #4 +tbz x12, #62, #-32768 +tbnz x12, #60, #32764 + +#------------------------------------------------------------------------------ +# Unconditional branch (immediate) +#------------------------------------------------------------------------------ + +b #4 +b #-4 +b #134217724 + +#------------------------------------------------------------------------------ +# Unconditional branch (register) +#------------------------------------------------------------------------------ + +br x20 +blr xzr +ret x10 +ret +eret +drps + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.25 add w2, w3, #4095 +# CHECK-NEXT: 1 1 0.25 add w30, w29, #1, lsl #12 +# CHECK-NEXT: 1 1 0.25 add w13, w5, #4095, lsl #12 +# CHECK-NEXT: 1 1 0.25 add x5, x7, #1638 +# CHECK-NEXT: 1 1 0.25 add w20, wsp, #801 +# CHECK-NEXT: 1 1 0.25 add wsp, wsp, #1104 +# CHECK-NEXT: 1 1 0.25 add wsp, w30, #4084 +# CHECK-NEXT: 1 1 0.25 add x0, x24, #291 +# CHECK-NEXT: 1 1 0.25 add x3, x24, #4095, lsl #12 +# CHECK-NEXT: 1 1 0.25 add x8, sp, #1074 +# CHECK-NEXT: 1 1 0.25 add sp, x29, #3816 +# CHECK-NEXT: 1 1 0.25 sub w0, wsp, #4077 +# CHECK-NEXT: 1 1 0.25 sub w4, w20, #546, lsl #12 +# CHECK-NEXT: 1 1 0.25 sub sp, sp, #288 +# CHECK-NEXT: 1 1 0.25 sub wsp, w19, 
#16 +# CHECK-NEXT: 1 1 0.50 adds w13, w23, #291, lsl #12 +# CHECK-NEXT: 1 1 0.50 cmn w2, #4095 +# CHECK-NEXT: 1 1 0.50 adds w20, wsp, #0 +# CHECK-NEXT: 1 1 0.50 cmn x3, #1, lsl #12 +# CHECK-NEXT: 1 1 0.50 cmp sp, #20, lsl #12 +# CHECK-NEXT: 1 1 0.50 cmp x30, #4095 +# CHECK-NEXT: 1 1 0.50 subs x4, sp, #3822 +# CHECK-NEXT: 1 1 0.50 cmn w3, #291, lsl #12 +# CHECK-NEXT: 1 1 0.50 cmn wsp, #1365 +# CHECK-NEXT: 1 1 0.50 cmn sp, #1092, lsl #12 +# CHECK-NEXT: 1 1 0.25 mov sp, x30 +# CHECK-NEXT: 1 1 0.25 mov wsp, w20 +# CHECK-NEXT: 1 1 0.25 mov x11, sp +# CHECK-NEXT: 1 1 0.25 mov w24, wsp +# CHECK-NEXT: 1 1 0.25 add w3, w5, w7 +# CHECK-NEXT: 1 1 0.25 add wzr, w3, w5 +# CHECK-NEXT: 1 1 0.25 add w20, wzr, w4 +# CHECK-NEXT: 1 1 0.25 add w4, w6, wzr +# CHECK-NEXT: 1 1 0.25 add w11, w13, w15 +# CHECK-NEXT: 2 2 0.50 add w9, w3, wzr, lsl #10 +# CHECK-NEXT: 2 2 0.50 add w17, w29, w20, lsl #31 +# CHECK-NEXT: 2 2 0.50 add w21, w22, w23, lsr #0 +# CHECK-NEXT: 2 2 0.50 add w24, w25, w26, lsr #18 +# CHECK-NEXT: 2 2 0.50 add w27, w28, w29, lsr #31 +# CHECK-NEXT: 2 2 0.50 add w2, w3, w4, asr #0 +# CHECK-NEXT: 2 2 0.50 add w5, w6, w7, asr #21 +# CHECK-NEXT: 2 2 0.50 add w8, w9, w10, asr #31 +# CHECK-NEXT: 1 1 0.25 add x3, x5, x7 +# CHECK-NEXT: 1 1 0.25 add xzr, x3, x5 +# CHECK-NEXT: 1 1 0.25 add x20, xzr, x4 +# CHECK-NEXT: 1 1 0.25 add x4, x6, xzr +# CHECK-NEXT: 1 1 0.25 add x11, x13, x15 +# CHECK-NEXT: 2 2 0.50 add x9, x3, xzr, lsl #10 +# CHECK-NEXT: 2 2 0.50 add x17, x29, x20, lsl #63 +# CHECK-NEXT: 2 2 0.50 add x21, x22, x23, lsr #0 +# CHECK-NEXT: 2 2 0.50 add x24, x25, x26, lsr #18 +# CHECK-NEXT: 2 2 0.50 add x27, x28, x29, lsr #63 +# CHECK-NEXT: 2 2 0.50 add x2, x3, x4, asr #0 +# CHECK-NEXT: 2 2 0.50 add x5, x6, x7, asr #21 +# CHECK-NEXT: 2 2 0.50 add x8, x9, x10, asr #63 +# CHECK-NEXT: 1 1 0.25 adds w3, w5, w7 +# CHECK-NEXT: 1 1 0.25 cmn w3, w5 +# CHECK-NEXT: 1 1 0.25 adds w20, wzr, w4 +# CHECK-NEXT: 1 1 0.25 adds w4, w6, wzr +# CHECK-NEXT: 1 1 0.25 adds w11, w13, w15 +# CHECK-NEXT: 2 
2 0.50 adds w9, w3, wzr, lsl #10 +# CHECK-NEXT: 2 2 0.50 adds w17, w29, w20, lsl #31 +# CHECK-NEXT: 2 2 0.50 adds w21, w22, w23, lsr #0 +# CHECK-NEXT: 2 2 0.50 adds w24, w25, w26, lsr #18 +# CHECK-NEXT: 2 2 0.50 adds w27, w28, w29, lsr #31 +# CHECK-NEXT: 2 2 0.50 adds w2, w3, w4, asr #0 +# CHECK-NEXT: 2 2 0.50 adds w5, w6, w7, asr #21 +# CHECK-NEXT: 2 2 0.50 adds w8, w9, w10, asr #31 +# CHECK-NEXT: 1 1 0.25 adds x3, x5, x7 +# CHECK-NEXT: 1 1 0.25 cmn x3, x5 +# CHECK-NEXT: 1 1 0.25 adds x20, xzr, x4 +# CHECK-NEXT: 1 1 0.25 adds x4, x6, xzr +# CHECK-NEXT: 1 1 0.25 adds x11, x13, x15 +# CHECK-NEXT: 2 2 0.50 adds x9, x3, xzr, lsl #10 +# CHECK-NEXT: 2 2 0.50 adds x17, x29, x20, lsl #63 +# CHECK-NEXT: 2 2 0.50 adds x21, x22, x23, lsr #0 +# CHECK-NEXT: 2 2 0.50 adds x24, x25, x26, lsr #18 +# CHECK-NEXT: 2 2 0.50 adds x27, x28, x29, lsr #63 +# CHECK-NEXT: 2 2 0.50 adds x2, x3, x4, asr #0 +# CHECK-NEXT: 2 2 0.50 adds x5, x6, x7, asr #21 +# CHECK-NEXT: 2 2 0.50 adds x8, x9, x10, asr #63 +# CHECK-NEXT: 1 1 0.25 sub w3, w5, w7 +# CHECK-NEXT: 1 1 0.25 sub wzr, w3, w5 +# CHECK-NEXT: 1 1 0.25 sub w4, w6, wzr +# CHECK-NEXT: 1 1 0.25 sub w11, w13, w15 +# CHECK-NEXT: 2 2 0.50 sub w9, w3, wzr, lsl #10 +# CHECK-NEXT: 2 2 0.50 sub w17, w29, w20, lsl #31 +# CHECK-NEXT: 2 2 0.50 sub w21, w22, w23, lsr #0 +# CHECK-NEXT: 2 2 0.50 sub w24, w25, w26, lsr #18 +# CHECK-NEXT: 2 2 0.50 sub w27, w28, w29, lsr #31 +# CHECK-NEXT: 2 2 0.50 sub w2, w3, w4, asr #0 +# CHECK-NEXT: 2 2 0.50 sub w5, w6, w7, asr #21 +# CHECK-NEXT: 2 2 0.50 sub w8, w9, w10, asr #31 +# CHECK-NEXT: 1 1 0.25 sub x3, x5, x7 +# CHECK-NEXT: 1 1 0.25 sub xzr, x3, x5 +# CHECK-NEXT: 1 1 0.25 sub x4, x6, xzr +# CHECK-NEXT: 1 1 0.25 sub x11, x13, x15 +# CHECK-NEXT: 2 2 0.50 sub x9, x3, xzr, lsl #10 +# CHECK-NEXT: 2 2 0.50 sub x17, x29, x20, lsl #63 +# CHECK-NEXT: 2 2 0.50 sub x21, x22, x23, lsr #0 +# CHECK-NEXT: 2 2 0.50 sub x24, x25, x26, lsr #18 +# CHECK-NEXT: 2 2 0.50 sub x27, x28, x29, lsr #63 +# CHECK-NEXT: 2 2 0.50 sub x2, x3, 
x4, asr #0 +# CHECK-NEXT: 2 2 0.50 sub x5, x6, x7, asr #21 +# CHECK-NEXT: 2 2 0.50 sub x8, x9, x10, asr #63 +# CHECK-NEXT: 1 1 0.25 subs w3, w5, w7 +# CHECK-NEXT: 1 1 0.25 cmp w3, w5 +# CHECK-NEXT: 1 1 0.25 subs w4, w6, wzr +# CHECK-NEXT: 1 1 0.25 subs w11, w13, w15 +# CHECK-NEXT: 2 2 0.50 subs w9, w3, wzr, lsl #10 +# CHECK-NEXT: 2 2 0.50 subs w17, w29, w20, lsl #31 +# CHECK-NEXT: 2 2 0.50 subs w21, w22, w23, lsr #0 +# CHECK-NEXT: 2 2 0.50 subs w24, w25, w26, lsr #18 +# CHECK-NEXT: 2 2 0.50 subs w27, w28, w29, lsr #31 +# CHECK-NEXT: 2 2 0.50 subs w2, w3, w4, asr #0 +# CHECK-NEXT: 2 2 0.50 subs w5, w6, w7, asr #21 +# CHECK-NEXT: 2 2 0.50 subs w8, w9, w10, asr #31 +# CHECK-NEXT: 1 1 0.25 subs x3, x5, x7 +# CHECK-NEXT: 1 1 0.25 cmp x3, x5 +# CHECK-NEXT: 1 1 0.25 subs x4, x6, xzr +# CHECK-NEXT: 1 1 0.25 subs x11, x13, x15 +# CHECK-NEXT: 2 2 0.50 subs x9, x3, xzr, lsl #10 +# CHECK-NEXT: 2 2 0.50 subs x17, x29, x20, lsl #63 +# CHECK-NEXT: 2 2 0.50 subs x21, x22, x23, lsr #0 +# CHECK-NEXT: 2 2 0.50 subs x24, x25, x26, lsr #18 +# CHECK-NEXT: 2 2 0.50 subs x27, x28, x29, lsr #63 +# CHECK-NEXT: 2 2 0.50 subs x2, x3, x4, asr #0 +# CHECK-NEXT: 2 2 0.50 subs x5, x6, x7, asr #21 +# CHECK-NEXT: 2 2 0.50 subs x8, x9, x10, asr #63 +# CHECK-NEXT: 1 1 0.25 cmn wzr, w4 +# CHECK-NEXT: 1 1 0.25 cmn w5, wzr +# CHECK-NEXT: 1 1 0.25 cmn w6, w7 +# CHECK-NEXT: 2 2 0.50 cmn w8, w9, lsl #15 +# CHECK-NEXT: 2 2 0.50 cmn w10, w11, lsl #31 +# CHECK-NEXT: 2 2 0.50 cmn w12, w13, lsr #0 +# CHECK-NEXT: 2 2 0.50 cmn w14, w15, lsr #21 +# CHECK-NEXT: 2 2 0.50 cmn w16, w17, lsr #31 +# CHECK-NEXT: 2 2 0.50 cmn w18, w19, asr #0 +# CHECK-NEXT: 2 2 0.50 cmn w20, w21, asr #22 +# CHECK-NEXT: 2 2 0.50 cmn w22, w23, asr #31 +# CHECK-NEXT: 1 1 0.25 cmn x0, x3 +# CHECK-NEXT: 1 1 0.25 cmn xzr, x4 +# CHECK-NEXT: 1 1 0.25 cmn x5, xzr +# CHECK-NEXT: 1 1 0.25 cmn x6, x7 +# CHECK-NEXT: 2 2 0.50 cmn x8, x9, lsl #15 +# CHECK-NEXT: 2 2 0.50 cmn x10, x11, lsl #63 +# CHECK-NEXT: 2 2 0.50 cmn x12, x13, lsr #0 +# CHECK-NEXT: 2 
2 0.50 cmn x14, x15, lsr #41 +# CHECK-NEXT: 2 2 0.50 cmn x16, x17, lsr #63 +# CHECK-NEXT: 2 2 0.50 cmn x18, x19, asr #0 +# CHECK-NEXT: 2 2 0.50 cmn x20, x21, asr #55 +# CHECK-NEXT: 2 2 0.50 cmn x22, x23, asr #63 +# CHECK-NEXT: 1 1 0.25 cmp w0, w3 +# CHECK-NEXT: 1 1 0.25 cmp wzr, w4 +# CHECK-NEXT: 1 1 0.25 cmp w5, wzr +# CHECK-NEXT: 1 1 0.25 cmp w6, w7 +# CHECK-NEXT: 2 2 0.50 cmp w8, w9, lsl #15 +# CHECK-NEXT: 2 2 0.50 cmp w10, w11, lsl #31 +# CHECK-NEXT: 2 2 0.50 cmp w12, w13, lsr #0 +# CHECK-NEXT: 2 2 0.50 cmp w14, w15, lsr #21 +# CHECK-NEXT: 2 2 0.50 cmp w18, w19, asr #0 +# CHECK-NEXT: 2 2 0.50 cmp w20, w21, asr #22 +# CHECK-NEXT: 2 2 0.50 cmp w22, w23, asr #31 +# CHECK-NEXT: 1 1 0.25 cmp x0, x3 +# CHECK-NEXT: 1 1 0.25 cmp xzr, x4 +# CHECK-NEXT: 1 1 0.25 cmp x5, xzr +# CHECK-NEXT: 1 1 0.25 cmp x6, x7 +# CHECK-NEXT: 2 2 0.50 cmp x8, x9, lsl #15 +# CHECK-NEXT: 2 2 0.50 cmp x10, x11, lsl #63 +# CHECK-NEXT: 2 2 0.50 cmp x12, x13, lsr #0 +# CHECK-NEXT: 2 2 0.50 cmp x14, x15, lsr #41 +# CHECK-NEXT: 2 2 0.50 cmp x16, x17, lsr #63 +# CHECK-NEXT: 2 2 0.50 cmp x18, x19, asr #0 +# CHECK-NEXT: 2 2 0.50 cmp x20, x21, asr #55 +# CHECK-NEXT: 2 2 0.50 cmp x22, x23, asr #63 +# CHECK-NEXT: 1 1 0.25 cmp wzr, w0 +# CHECK-NEXT: 1 1 0.25 cmp xzr, x0 +# CHECK-NEXT: 1 1 0.50 adc w29, w27, w25 +# CHECK-NEXT: 1 1 0.50 adc wzr, w3, w4 +# CHECK-NEXT: 1 1 0.50 adc w9, wzr, w10 +# CHECK-NEXT: 1 1 0.50 adc w20, w0, wzr +# CHECK-NEXT: 1 1 0.50 adc x29, x27, x25 +# CHECK-NEXT: 1 1 0.50 adc xzr, x3, x4 +# CHECK-NEXT: 1 1 0.50 adc x9, xzr, x10 +# CHECK-NEXT: 1 1 0.50 adc x20, x0, xzr +# CHECK-NEXT: 1 1 0.50 adcs w29, w27, w25 +# CHECK-NEXT: 1 1 0.50 adcs wzr, w3, w4 +# CHECK-NEXT: 1 1 0.50 adcs w9, wzr, w10 +# CHECK-NEXT: 1 1 0.50 adcs w20, w0, wzr +# CHECK-NEXT: 1 1 0.50 adcs x29, x27, x25 +# CHECK-NEXT: 1 1 0.50 adcs xzr, x3, x4 +# CHECK-NEXT: 1 1 0.50 adcs x9, xzr, x10 +# CHECK-NEXT: 1 1 0.50 adcs x20, x0, xzr +# CHECK-NEXT: 1 1 0.50 sbc w29, w27, w25 +# CHECK-NEXT: 1 1 0.50 sbc wzr, w3, w4 +# 
CHECK-NEXT: 1 1 0.50 ngc w9, w10 +# CHECK-NEXT: 1 1 0.50 sbc w20, w0, wzr +# CHECK-NEXT: 1 1 0.50 sbc x29, x27, x25 +# CHECK-NEXT: 1 1 0.50 sbc xzr, x3, x4 +# CHECK-NEXT: 1 1 0.50 ngc x9, x10 +# CHECK-NEXT: 1 1 0.50 sbc x20, x0, xzr +# CHECK-NEXT: 1 1 0.50 sbcs w29, w27, w25 +# CHECK-NEXT: 1 1 0.50 sbcs wzr, w3, w4 +# CHECK-NEXT: 1 1 0.50 ngcs w9, w10 +# CHECK-NEXT: 1 1 0.50 sbcs w20, w0, wzr +# CHECK-NEXT: 1 1 0.50 sbcs x29, x27, x25 +# CHECK-NEXT: 1 1 0.50 sbcs xzr, x3, x4 +# CHECK-NEXT: 1 1 0.50 ngcs x9, x10 +# CHECK-NEXT: 1 1 0.50 sbcs x20, x0, xzr +# CHECK-NEXT: 1 1 0.50 ngc w3, w12 +# CHECK-NEXT: 1 1 0.50 ngc wzr, w9 +# CHECK-NEXT: 1 1 0.50 ngc w23, wzr +# CHECK-NEXT: 1 1 0.50 ngc x29, x30 +# CHECK-NEXT: 1 1 0.50 ngc xzr, x0 +# CHECK-NEXT: 1 1 0.50 ngc x0, xzr +# CHECK-NEXT: 1 1 0.50 ngcs w3, w12 +# CHECK-NEXT: 1 1 0.50 ngcs wzr, w9 +# CHECK-NEXT: 1 1 0.50 ngcs w23, wzr +# CHECK-NEXT: 1 1 0.50 ngcs x29, x30 +# CHECK-NEXT: 1 1 0.50 ngcs xzr, x0 +# CHECK-NEXT: 1 1 0.50 ngcs x0, xzr +# CHECK-NEXT: 1 1 0.50 sbfx x1, x2, #3, #2 +# CHECK-NEXT: 1 1 0.50 asr x3, x4, #63 +# CHECK-NEXT: 1 1 0.50 asr wzr, wzr, #31 +# CHECK-NEXT: 1 1 0.50 sbfx w12, w9, #0, #1 +# CHECK-NEXT: 1 1 0.50 ubfiz x4, x5, #52, #11 +# CHECK-NEXT: 1 1 0.50 ubfx xzr, x4, #0, #1 +# CHECK-NEXT: 1 1 0.50 ubfiz x4, xzr, #1, #6 +# CHECK-NEXT: 1 1 0.50 lsr x5, x6, #12 +# CHECK-NEXT: 1 1 0.50 bfi x4, x5, #52, #11 +# CHECK-NEXT: 1 1 0.50 bfxil xzr, x4, #0, #1 +# CHECK-NEXT: 1 1 0.50 bfc x4, #1, #6 +# CHECK-NEXT: 1 1 0.50 bfxil x5, x6, #12, #52 +# CHECK-NEXT: 1 1 0.50 sxtb w1, w2 +# CHECK-NEXT: 1 1 0.50 sxtb xzr, w3 +# CHECK-NEXT: 1 1 0.50 sxth w9, w10 +# CHECK-NEXT: 1 1 0.50 sxth x0, w1 +# CHECK-NEXT: 1 1 0.50 sxtw x3, w30 +# CHECK-NEXT: 1 1 0.50 uxtb w1, w2 +# CHECK-NEXT: 1 1 0.50 uxth w9, w10 +# CHECK-NEXT: 1 1 0.50 ubfx x3, x30, #0, #32 +# CHECK-NEXT: 1 1 0.50 asr w3, w2, #0 +# CHECK-NEXT: 1 1 0.50 asr w9, w10, #31 +# CHECK-NEXT: 1 1 0.50 asr x20, x21, #63 +# CHECK-NEXT: 1 1 0.50 asr w1, wzr, #3 +# 
CHECK-NEXT: 1 1 0.50 lsr w3, w2, #0 +# CHECK-NEXT: 1 1 0.50 lsr w9, w10, #31 +# CHECK-NEXT: 1 1 0.50 lsr x20, x21, #63 +# CHECK-NEXT: 1 1 0.50 lsr wzr, wzr, #3 +# CHECK-NEXT: 1 1 0.50 lsr w3, w2, #0 +# CHECK-NEXT: 1 1 0.50 lsl w9, w10, #31 +# CHECK-NEXT: 1 1 0.50 lsl x20, x21, #63 +# CHECK-NEXT: 1 1 0.50 lsl w1, wzr, #3 +# CHECK-NEXT: 1 1 0.50 sbfx w9, w10, #0, #1 +# CHECK-NEXT: 1 1 0.50 sbfiz x2, x3, #63, #1 +# CHECK-NEXT: 1 1 0.50 asr x19, x20, #0 +# CHECK-NEXT: 1 1 0.50 sbfiz x9, x10, #5, #59 +# CHECK-NEXT: 1 1 0.50 asr w9, w10, #0 +# CHECK-NEXT: 1 1 0.50 sbfiz w11, w12, #31, #1 +# CHECK-NEXT: 1 1 0.50 sbfiz w13, w14, #29, #3 +# CHECK-NEXT: 1 1 0.50 sbfiz xzr, xzr, #10, #11 +# CHECK-NEXT: 1 1 0.50 sbfx w9, w10, #0, #1 +# CHECK-NEXT: 1 1 0.50 asr x2, x3, #63 +# CHECK-NEXT: 1 1 0.50 asr x19, x20, #0 +# CHECK-NEXT: 1 1 0.50 asr x9, x10, #5 +# CHECK-NEXT: 1 1 0.50 asr w9, w10, #0 +# CHECK-NEXT: 1 1 0.50 asr w11, w12, #31 +# CHECK-NEXT: 1 1 0.50 asr w13, w14, #29 +# CHECK-NEXT: 1 1 0.50 sbfx xzr, xzr, #10, #11 +# CHECK-NEXT: 1 1 0.50 bfxil w9, w10, #0, #1 +# CHECK-NEXT: 1 1 0.50 bfi x2, x3, #63, #1 +# CHECK-NEXT: 1 1 0.50 bfxil x19, x20, #0, #64 +# CHECK-NEXT: 1 1 0.50 bfi x9, x10, #5, #59 +# CHECK-NEXT: 1 1 0.50 bfxil w9, w10, #0, #32 +# CHECK-NEXT: 1 1 0.50 bfi w11, w12, #31, #1 +# CHECK-NEXT: 1 1 0.50 bfi w13, w14, #29, #3 +# CHECK-NEXT: 1 1 0.50 bfc xzr, #10, #11 +# CHECK-NEXT: 1 1 0.50 bfxil w9, w10, #0, #1 +# CHECK-NEXT: 1 1 0.50 bfxil x2, x3, #63, #1 +# CHECK-NEXT: 1 1 0.50 bfxil x19, x20, #0, #64 +# CHECK-NEXT: 1 1 0.50 bfxil x9, x10, #5, #59 +# CHECK-NEXT: 1 1 0.50 bfxil w9, w10, #0, #32 +# CHECK-NEXT: 1 1 0.50 bfxil w11, w12, #31, #1 +# CHECK-NEXT: 1 1 0.50 bfxil w13, w14, #29, #3 +# CHECK-NEXT: 1 1 0.50 bfxil xzr, xzr, #10, #11 +# CHECK-NEXT: 1 1 0.50 ubfx w9, w10, #0, #1 +# CHECK-NEXT: 1 1 0.50 lsl x2, x3, #63 +# CHECK-NEXT: 1 1 0.50 lsr x19, x20, #0 +# CHECK-NEXT: 1 1 0.50 lsl x9, x10, #5 +# CHECK-NEXT: 1 1 0.50 lsr w9, w10, #0 +# CHECK-NEXT: 1 1 0.50 
lsl w11, w12, #31 +# CHECK-NEXT: 1 1 0.50 lsl w13, w14, #29 +# CHECK-NEXT: 1 1 0.50 ubfiz xzr, xzr, #10, #11 +# CHECK-NEXT: 1 1 0.50 ubfx w9, w10, #0, #1 +# CHECK-NEXT: 1 1 0.50 lsr x2, x3, #63 +# CHECK-NEXT: 1 1 0.50 lsr x19, x20, #0 +# CHECK-NEXT: 1 1 0.50 lsr x9, x10, #5 +# CHECK-NEXT: 1 1 0.50 lsr w9, w10, #0 +# CHECK-NEXT: 1 1 0.50 lsr w11, w12, #31 +# CHECK-NEXT: 1 1 0.50 lsr w13, w14, #29 +# CHECK-NEXT: 1 1 0.50 ubfx xzr, xzr, #10, #11 +# CHECK-NEXT: 1 1 0.50 cbz w5, #4 +# CHECK-NEXT: 1 1 0.50 cbz x5, #0 +# CHECK-NEXT: 1 1 0.50 cbnz x2, #-4 +# CHECK-NEXT: 1 1 0.50 cbnz x26, #1048572 +# CHECK-NEXT: 1 1 0.50 cbz wzr, #0 +# CHECK-NEXT: 1 1 0.50 cbnz xzr, #0 +# CHECK-NEXT: 1 1 0.50 b.ne #4 +# CHECK-NEXT: 1 1 0.50 b.ge #1048572 +# CHECK-NEXT: 1 1 0.50 b.ge #-4 +# CHECK-NEXT: 1 1 0.50 ccmp w1, #31, #0, eq +# CHECK-NEXT: 1 1 0.50 ccmp w3, #0, #15, hs +# CHECK-NEXT: 1 1 0.50 ccmp wzr, #15, #13, hs +# CHECK-NEXT: 1 1 0.50 ccmp x9, #31, #0, le +# CHECK-NEXT: 1 1 0.50 ccmp x3, #0, #15, gt +# CHECK-NEXT: 1 1 0.50 ccmp xzr, #5, #7, ne +# CHECK-NEXT: 1 1 0.50 ccmn w1, #31, #0, eq +# CHECK-NEXT: 1 1 0.50 ccmn w3, #0, #15, hs +# CHECK-NEXT: 1 1 0.50 ccmn wzr, #15, #13, hs +# CHECK-NEXT: 1 1 0.50 ccmn x9, #31, #0, le +# CHECK-NEXT: 1 1 0.50 ccmn x3, #0, #15, gt +# CHECK-NEXT: 1 1 0.50 ccmn xzr, #5, #7, ne +# CHECK-NEXT: 1 1 0.50 ccmp w1, wzr, #0, eq +# CHECK-NEXT: 1 1 0.50 ccmp w3, w0, #15, hs +# CHECK-NEXT: 1 1 0.50 ccmp wzr, w15, #13, hs +# CHECK-NEXT: 1 1 0.50 ccmp x9, xzr, #0, le +# CHECK-NEXT: 1 1 0.50 ccmp x3, x0, #15, gt +# CHECK-NEXT: 1 1 0.50 ccmp xzr, x5, #7, ne +# CHECK-NEXT: 1 1 0.50 ccmn w1, wzr, #0, eq +# CHECK-NEXT: 1 1 0.50 ccmn w3, w0, #15, hs +# CHECK-NEXT: 1 1 0.50 ccmn wzr, w15, #13, hs +# CHECK-NEXT: 1 1 0.50 ccmn x9, xzr, #0, le +# CHECK-NEXT: 1 1 0.50 ccmn x3, x0, #15, gt +# CHECK-NEXT: 1 1 0.50 ccmn xzr, x5, #7, ne +# CHECK-NEXT: 1 1 0.50 csel w1, w0, w19, ne +# CHECK-NEXT: 1 1 0.50 csel wzr, w5, w9, eq +# CHECK-NEXT: 1 1 0.50 csel w9, wzr, w30, gt +# 
CHECK-NEXT: 1 1 0.50 csel w1, w28, wzr, mi +# CHECK-NEXT: 1 1 0.50 csel x19, x23, x29, lt +# CHECK-NEXT: 1 1 0.50 csel xzr, x3, x4, ge +# CHECK-NEXT: 1 1 0.50 csel x5, xzr, x6, hs +# CHECK-NEXT: 1 1 0.50 csel x7, x8, xzr, lo +# CHECK-NEXT: 1 1 0.50 csinc w1, w0, w19, ne +# CHECK-NEXT: 1 1 0.50 csinc wzr, w5, w9, eq +# CHECK-NEXT: 1 1 0.50 csinc w9, wzr, w30, gt +# CHECK-NEXT: 1 1 0.50 csinc w1, w28, wzr, mi +# CHECK-NEXT: 1 1 0.50 csinc x19, x23, x29, lt +# CHECK-NEXT: 1 1 0.50 csinc xzr, x3, x4, ge +# CHECK-NEXT: 1 1 0.50 csinc x5, xzr, x6, hs +# CHECK-NEXT: 1 1 0.50 csinc x7, x8, xzr, lo +# CHECK-NEXT: 1 1 0.50 csinv w1, w0, w19, ne +# CHECK-NEXT: 1 1 0.50 csinv wzr, w5, w9, eq +# CHECK-NEXT: 1 1 0.50 csinv w9, wzr, w30, gt +# CHECK-NEXT: 1 1 0.50 csinv w1, w28, wzr, mi +# CHECK-NEXT: 1 1 0.50 csinv x19, x23, x29, lt +# CHECK-NEXT: 1 1 0.50 csinv xzr, x3, x4, ge +# CHECK-NEXT: 1 1 0.50 csinv x5, xzr, x6, hs +# CHECK-NEXT: 1 1 0.50 csinv x7, x8, xzr, lo +# CHECK-NEXT: 1 1 0.50 csneg w1, w0, w19, ne +# CHECK-NEXT: 1 1 0.50 csneg wzr, w5, w9, eq +# CHECK-NEXT: 1 1 0.50 csneg w9, wzr, w30, gt +# CHECK-NEXT: 1 1 0.50 csneg w1, w28, wzr, mi +# CHECK-NEXT: 1 1 0.50 csneg x19, x23, x29, lt +# CHECK-NEXT: 1 1 0.50 csneg xzr, x3, x4, ge +# CHECK-NEXT: 1 1 0.50 csneg x5, xzr, x6, hs +# CHECK-NEXT: 1 1 0.50 csneg x7, x8, xzr, lo +# CHECK-NEXT: 1 1 0.50 cset w3, eq +# CHECK-NEXT: 1 1 0.50 cset x9, pl +# CHECK-NEXT: 1 1 0.50 csetm w20, ne +# CHECK-NEXT: 1 1 0.50 csetm x30, ge +# CHECK-NEXT: 1 1 0.50 csinc w2, wzr, wzr, al +# CHECK-NEXT: 1 1 0.50 csinv x3, xzr, xzr, nv +# CHECK-NEXT: 1 1 0.50 cinc w3, w5, gt +# CHECK-NEXT: 1 1 0.50 cinc wzr, w4, le +# CHECK-NEXT: 1 1 0.50 cset w9, lt +# CHECK-NEXT: 1 1 0.50 cinc x3, x5, gt +# CHECK-NEXT: 1 1 0.50 cinc xzr, x4, le +# CHECK-NEXT: 1 1 0.50 cset x9, lt +# CHECK-NEXT: 1 1 0.50 csinc w5, w6, w6, nv +# CHECK-NEXT: 1 1 0.50 csinc x1, x2, x2, al +# CHECK-NEXT: 1 1 0.50 cinv w3, w5, gt +# CHECK-NEXT: 1 1 0.50 cinv wzr, w4, le +# 
CHECK-NEXT: 1 1 0.50 csetm w9, lt +# CHECK-NEXT: 1 1 0.50 cinv x3, x5, gt +# CHECK-NEXT: 1 1 0.50 cinv xzr, x4, le +# CHECK-NEXT: 1 1 0.50 csetm x9, lt +# CHECK-NEXT: 1 1 0.50 csinv x1, x0, x0, al +# CHECK-NEXT: 1 1 0.50 csinv w9, w8, w8, nv +# CHECK-NEXT: 1 1 0.50 cneg w3, w5, gt +# CHECK-NEXT: 1 1 0.50 cneg wzr, w4, le +# CHECK-NEXT: 1 1 0.50 cneg w9, wzr, lt +# CHECK-NEXT: 1 1 0.50 cneg x3, x5, gt +# CHECK-NEXT: 1 1 0.50 cneg xzr, x4, le +# CHECK-NEXT: 1 1 0.50 cneg x9, xzr, lt +# CHECK-NEXT: 1 1 0.50 csneg x4, x8, x8, al +# CHECK-NEXT: 1 1 0.50 csinv w9, w8, w8, nv +# CHECK-NEXT: 1 1 0.50 rbit w0, w7 +# CHECK-NEXT: 1 1 0.50 rbit x18, x3 +# CHECK-NEXT: 1 1 0.50 rev16 w17, w1 +# CHECK-NEXT: 1 1 0.50 rev16 x5, x2 +# CHECK-NEXT: 1 1 0.50 rev w18, w0 +# CHECK-NEXT: 1 1 0.50 rev32 x20, x1 +# CHECK-NEXT: 1 1 0.50 rev x22, x2 +# CHECK-NEXT: 1 1 0.25 clz w24, w3 +# CHECK-NEXT: 1 1 0.25 clz x26, x4 +# CHECK-NEXT: 1 1 0.50 cls w3, w5 +# CHECK-NEXT: 1 1 0.50 cls x20, x5 +# CHECK-NEXT: 2 13 1.00 udiv w0, w7, w10 +# CHECK-NEXT: 3 13 2.00 udiv x9, x22, x4 +# CHECK-NEXT: 2 13 1.00 sdiv w12, w21, w0 +# CHECK-NEXT: 3 13 2.00 sdiv x13, x2, x1 +# CHECK-NEXT: 1 1 0.50 lsl w11, w12, w13 +# CHECK-NEXT: 1 1 0.50 lsl x14, x15, x16 +# CHECK-NEXT: 1 1 0.50 lsr w17, w18, w19 +# CHECK-NEXT: 1 1 0.50 lsr x20, x21, x22 +# CHECK-NEXT: 1 1 0.50 asr w23, w24, w25 +# CHECK-NEXT: 1 1 0.50 asr x26, x27, x28 +# CHECK-NEXT: 1 1 0.50 ror w0, w1, w2 +# CHECK-NEXT: 1 1 0.50 ror x3, x4, x5 +# CHECK-NEXT: 1 1 0.50 lsl w6, w7, w8 +# CHECK-NEXT: 1 1 0.50 lsl x9, x10, x11 +# CHECK-NEXT: 1 1 0.50 lsr w12, w13, w14 +# CHECK-NEXT: 1 1 0.50 lsr x15, x16, x17 +# CHECK-NEXT: 1 1 0.50 asr w18, w19, w20 +# CHECK-NEXT: 1 1 0.50 asr x21, x22, x23 +# CHECK-NEXT: 1 1 0.50 ror w24, w25, w26 +# CHECK-NEXT: 1 1 0.50 ror x27, x28, x29 +# CHECK-NEXT: 1 3 1.00 smulh x30, x29, x28 +# CHECK-NEXT: 1 3 1.00 smulh xzr, x27, x26 +# CHECK-NEXT: 1 3 1.00 umulh x30, x29, x28 +# CHECK-NEXT: 1 3 1.00 umulh x23, x30, xzr +# CHECK-NEXT: 
1 3 1.00 madd w1, w3, w7, w4 +# CHECK-NEXT: 1 3 1.00 madd wzr, w0, w9, w11 +# CHECK-NEXT: 1 3 1.00 madd w13, wzr, w4, w4 +# CHECK-NEXT: 1 3 1.00 madd w19, w30, wzr, w29 +# CHECK-NEXT: 1 3 1.00 mul w4, w5, w6 +# CHECK-NEXT: 1 3 1.00 madd x1, x3, x7, x4 +# CHECK-NEXT: 1 3 1.00 madd xzr, x0, x9, x11 +# CHECK-NEXT: 1 3 1.00 madd x13, xzr, x4, x4 +# CHECK-NEXT: 1 3 1.00 madd x19, x30, xzr, x29 +# CHECK-NEXT: 1 3 1.00 mul x4, x5, x6 +# CHECK-NEXT: 1 3 1.00 msub w1, w3, w7, w4 +# CHECK-NEXT: 1 3 1.00 msub wzr, w0, w9, w11 +# CHECK-NEXT: 1 3 1.00 msub w13, wzr, w4, w4 +# CHECK-NEXT: 1 3 1.00 msub w19, w30, wzr, w29 +# CHECK-NEXT: 1 3 1.00 mneg w4, w5, w6 +# CHECK-NEXT: 1 3 1.00 msub x1, x3, x7, x4 +# CHECK-NEXT: 1 3 1.00 msub xzr, x0, x9, x11 +# CHECK-NEXT: 1 3 1.00 msub x13, xzr, x4, x4 +# CHECK-NEXT: 1 3 1.00 msub x19, x30, xzr, x29 +# CHECK-NEXT: 1 3 1.00 mneg x4, x5, x6 +# CHECK-NEXT: 2 4 1.00 smaddl x3, w5, w2, x9 +# CHECK-NEXT: 2 4 1.00 smaddl xzr, w10, w11, x12 +# CHECK-NEXT: 2 4 1.00 smaddl x13, wzr, w14, x15 +# CHECK-NEXT: 2 4 1.00 smaddl x16, w17, wzr, x18 +# CHECK-NEXT: 2 4 1.00 smull x19, w20, w21 +# CHECK-NEXT: 2 4 1.00 smsubl x3, w5, w2, x9 +# CHECK-NEXT: 2 4 1.00 smsubl xzr, w10, w11, x12 +# CHECK-NEXT: 2 4 1.00 smsubl x13, wzr, w14, x15 +# CHECK-NEXT: 2 4 1.00 smsubl x16, w17, wzr, x18 +# CHECK-NEXT: 2 4 1.00 smnegl x19, w20, w21 +# CHECK-NEXT: 2 4 1.00 umaddl x3, w5, w2, x9 +# CHECK-NEXT: 2 4 1.00 umaddl xzr, w10, w11, x12 +# CHECK-NEXT: 2 4 1.00 umaddl x13, wzr, w14, x15 +# CHECK-NEXT: 2 4 1.00 umaddl x16, w17, wzr, x18 +# CHECK-NEXT: 2 4 1.00 umull x19, w20, w21 +# CHECK-NEXT: 2 4 1.00 umsubl x3, w5, w2, x9 +# CHECK-NEXT: 2 4 1.00 umsubl x16, w17, wzr, x18 +# CHECK-NEXT: 2 4 1.00 umnegl x19, w20, w21 +# CHECK-NEXT: 1 3 1.00 smulh x30, x29, x28 +# CHECK-NEXT: 1 3 1.00 smulh x23, x22, xzr +# CHECK-NEXT: 1 3 1.00 umulh x23, x22, xzr +# CHECK-NEXT: 1 3 1.00 mul x19, x20, xzr +# CHECK-NEXT: 1 3 1.00 mneg w21, w22, w23 +# CHECK-NEXT: 2 4 1.00 smull x11, w13, 
w17 +# CHECK-NEXT: 2 4 1.00 umull x11, w13, w17 +# CHECK-NEXT: 2 4 1.00 smnegl x11, w13, w17 +# CHECK-NEXT: 2 4 1.00 umnegl x11, w13, w17 +# CHECK-NEXT: 1 1 0.50 extr w3, w5, w7, #0 +# CHECK-NEXT: 1 1 0.50 extr w11, w13, w17, #31 +# CHECK-NEXT: 1 1 0.50 extr x3, x5, x7, #15 +# CHECK-NEXT: 1 1 0.50 extr x11, x13, x17, #63 +# CHECK-NEXT: 1 1 0.50 ror x19, x23, #24 +# CHECK-NEXT: 1 1 0.50 ror x29, xzr, #63 +# CHECK-NEXT: 1 1 0.50 ror w9, w13, #31 +# CHECK-NEXT: 1 3 1.00 fcmp s3, s5 +# CHECK-NEXT: 1 3 1.00 fcmp s31, #0.0 +# CHECK-NEXT: 1 3 1.00 fcmp s31, #0.0 +# CHECK-NEXT: 1 3 1.00 fcmpe s29, s30 +# CHECK-NEXT: 1 3 1.00 fcmpe s15, #0.0 +# CHECK-NEXT: 1 3 1.00 fcmpe s15, #0.0 +# CHECK-NEXT: 1 3 1.00 fcmp d4, d12 +# CHECK-NEXT: 1 3 1.00 fcmp d23, #0.0 +# CHECK-NEXT: 1 3 1.00 fcmp d23, #0.0 +# CHECK-NEXT: 1 3 1.00 fcmpe d26, d22 +# CHECK-NEXT: 1 3 1.00 fcmpe d29, #0.0 +# CHECK-NEXT: 1 3 1.00 fcmpe d29, #0.0 +# CHECK-NEXT: 3 9 1.00 fccmp s1, s31, #0, eq +# CHECK-NEXT: 3 9 1.00 fccmp s3, s0, #15, hs +# CHECK-NEXT: 3 9 1.00 fccmp s31, s15, #13, hs +# CHECK-NEXT: 3 9 1.00 fccmp d9, d31, #0, le +# CHECK-NEXT: 3 9 1.00 fccmp d3, d0, #15, gt +# CHECK-NEXT: 3 9 1.00 fccmp d31, d5, #7, ne +# CHECK-NEXT: 3 9 1.00 fccmpe s1, s31, #0, eq +# CHECK-NEXT: 3 9 1.00 fccmpe s3, s0, #15, hs +# CHECK-NEXT: 3 9 1.00 fccmpe s31, s15, #13, hs +# CHECK-NEXT: 3 9 1.00 fccmpe d9, d31, #0, le +# CHECK-NEXT: 3 9 1.00 fccmpe d3, d0, #15, gt +# CHECK-NEXT: 3 9 1.00 fccmpe d31, d5, #7, ne +# CHECK-NEXT: 3 9 1.00 fcsel s3, s20, s9, pl +# CHECK-NEXT: 3 9 1.00 fcsel d9, d10, d11, mi +# CHECK-NEXT: 1 2 0.50 fmov s0, s1 +# CHECK-NEXT: 1 2 0.50 fabs s2, s3 +# CHECK-NEXT: 1 2 0.50 fneg s4, s5 +# CHECK-NEXT: 1 33 1.00 fsqrt s6, s7 +# CHECK-NEXT: 1 3 0.50 fcvt d8, s9 +# CHECK-NEXT: 1 3 0.50 fcvt h10, s11 +# CHECK-NEXT: 1 2 0.50 frintn s12, s13 +# CHECK-NEXT: 1 2 0.50 frintp s14, s15 +# CHECK-NEXT: 1 2 0.50 frintm s16, s17 +# CHECK-NEXT: 1 2 0.50 frintz s18, s19 +# CHECK-NEXT: 1 2 0.50 frinta s20, s21 +# 
CHECK-NEXT: 1 2 0.50 frintx s22, s23 +# CHECK-NEXT: 1 2 0.50 frinti s24, s25 +# CHECK-NEXT: 1 2 0.50 fmov d0, d1 +# CHECK-NEXT: 1 2 0.50 fabs d2, d3 +# CHECK-NEXT: 1 2 0.50 fneg d4, d5 +# CHECK-NEXT: 1 63 1.00 fsqrt d6, d7 +# CHECK-NEXT: 1 3 0.50 fcvt s8, d9 +# CHECK-NEXT: 1 3 0.50 fcvt h10, d11 +# CHECK-NEXT: 1 2 0.50 frintn d12, d13 +# CHECK-NEXT: 1 2 0.50 frintp d14, d15 +# CHECK-NEXT: 1 2 0.50 frintm d16, d17 +# CHECK-NEXT: 1 2 0.50 frintz d18, d19 +# CHECK-NEXT: 1 2 0.50 frinta d20, d21 +# CHECK-NEXT: 1 2 0.50 frintx d22, d23 +# CHECK-NEXT: 1 2 0.50 frinti d24, d25 +# CHECK-NEXT: 1 3 0.50 fcvt s26, h27 +# CHECK-NEXT: 1 3 0.50 fcvt d28, h29 +# CHECK-NEXT: 1 4 0.50 fmul s20, s19, s17 +# CHECK-NEXT: 1 12 1.00 fdiv s1, s2, s3 +# CHECK-NEXT: 1 2 0.50 fadd s4, s5, s6 +# CHECK-NEXT: 1 2 0.50 fsub s7, s8, s9 +# CHECK-NEXT: 1 2 0.50 fmax s10, s11, s12 +# CHECK-NEXT: 1 2 0.50 fmin s13, s14, s15 +# CHECK-NEXT: 1 2 0.50 fmaxnm s16, s17, s18 +# CHECK-NEXT: 1 2 0.50 fminnm s19, s20, s21 +# CHECK-NEXT: 1 4 0.50 fnmul s22, s23, s2 +# CHECK-NEXT: 1 4 0.50 fmul d20, d19, d17 +# CHECK-NEXT: 1 19 1.00 fdiv d1, d2, d3 +# CHECK-NEXT: 1 2 0.50 fadd d4, d5, d6 +# CHECK-NEXT: 1 2 0.50 fsub d7, d8, d9 +# CHECK-NEXT: 1 2 0.50 fmax d10, d11, d12 +# CHECK-NEXT: 1 2 0.50 fmin d13, d14, d15 +# CHECK-NEXT: 1 2 0.50 fmaxnm d16, d17, d18 +# CHECK-NEXT: 1 2 0.50 fminnm d19, d20, d21 +# CHECK-NEXT: 1 4 0.50 fnmul d22, d23, d24 +# CHECK-NEXT: 1 4 0.50 fmadd s3, s5, s6, s31 +# CHECK-NEXT: 1 4 0.50 fmadd d3, d13, d0, d23 +# CHECK-NEXT: 1 4 0.50 fmsub s3, s5, s6, s31 +# CHECK-NEXT: 1 4 0.50 fmsub d3, d13, d0, d23 +# CHECK-NEXT: 1 4 0.50 fnmadd s3, s5, s6, s31 +# CHECK-NEXT: 1 4 0.50 fnmadd d3, d13, d0, d23 +# CHECK-NEXT: 1 4 0.50 fnmsub s3, s5, s6, s31 +# CHECK-NEXT: 1 4 0.50 fnmsub d3, d13, d0, d23 +# CHECK-NEXT: 2 7 1.00 fcvtzs w3, h5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzs wzr, h20, #13 +# CHECK-NEXT: 2 7 1.00 fcvtzs w19, h0, #32 +# CHECK-NEXT: 2 7 1.00 fcvtzs x3, h5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzs 
x12, h30, #45 +# CHECK-NEXT: 2 7 1.00 fcvtzs x19, h0, #64 +# CHECK-NEXT: 2 7 1.00 fcvtzs w3, s5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzs wzr, s20, #13 +# CHECK-NEXT: 2 7 1.00 fcvtzs w19, s0, #32 +# CHECK-NEXT: 2 7 1.00 fcvtzs x3, s5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzs x12, s30, #45 +# CHECK-NEXT: 2 7 1.00 fcvtzs x19, s0, #64 +# CHECK-NEXT: 2 7 1.00 fcvtzs w3, d5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzs wzr, d20, #13 +# CHECK-NEXT: 2 7 1.00 fcvtzs w19, d0, #32 +# CHECK-NEXT: 2 7 1.00 fcvtzs x3, d5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzs x12, d30, #45 +# CHECK-NEXT: 2 7 1.00 fcvtzs x19, d0, #64 +# CHECK-NEXT: 2 7 1.00 fcvtzu w3, h5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzu wzr, h20, #13 +# CHECK-NEXT: 2 7 1.00 fcvtzu w19, h0, #32 +# CHECK-NEXT: 2 7 1.00 fcvtzu x3, h5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzu x12, h30, #45 +# CHECK-NEXT: 2 7 1.00 fcvtzu x19, h0, #64 +# CHECK-NEXT: 2 7 1.00 fcvtzu w3, s5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzu wzr, s20, #13 +# CHECK-NEXT: 2 7 1.00 fcvtzu w19, s0, #32 +# CHECK-NEXT: 2 7 1.00 fcvtzu x3, s5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzu x12, s30, #45 +# CHECK-NEXT: 2 7 1.00 fcvtzu x19, s0, #64 +# CHECK-NEXT: 2 7 1.00 fcvtzu w3, d5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzu wzr, d20, #13 +# CHECK-NEXT: 2 7 1.00 fcvtzu w19, d0, #32 +# CHECK-NEXT: 2 7 1.00 fcvtzu x3, d5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzu x12, d30, #45 +# CHECK-NEXT: 2 7 1.00 fcvtzu x19, d0, #64 +# CHECK-NEXT: 3 11 1.00 scvtf h23, w19, #1 +# CHECK-NEXT: 3 11 1.00 scvtf h31, wzr, #20 +# CHECK-NEXT: 3 11 1.00 scvtf h14, w0, #32 +# CHECK-NEXT: 3 11 1.00 scvtf h23, x19, #1 +# CHECK-NEXT: 3 11 1.00 scvtf h31, xzr, #20 +# CHECK-NEXT: 3 11 1.00 scvtf h14, x0, #64 +# CHECK-NEXT: 3 11 1.00 scvtf s23, w19, #1 +# CHECK-NEXT: 3 11 1.00 scvtf s31, wzr, #20 +# CHECK-NEXT: 3 11 1.00 scvtf s14, w0, #32 +# CHECK-NEXT: 3 11 1.00 scvtf s23, x19, #1 +# CHECK-NEXT: 3 11 1.00 scvtf s31, xzr, #20 +# CHECK-NEXT: 3 11 1.00 scvtf s14, x0, #64 +# CHECK-NEXT: 3 11 1.00 scvtf d23, w19, #1 +# CHECK-NEXT: 3 11 1.00 scvtf d31, wzr, #20 +# 
CHECK-NEXT: 3 11 1.00 scvtf d14, w0, #32 +# CHECK-NEXT: 3 11 1.00 scvtf d23, x19, #1 +# CHECK-NEXT: 3 11 1.00 scvtf d31, xzr, #20 +# CHECK-NEXT: 3 11 1.00 scvtf d14, x0, #64 +# CHECK-NEXT: 3 11 1.00 ucvtf h23, w19, #1 +# CHECK-NEXT: 3 11 1.00 ucvtf h31, wzr, #20 +# CHECK-NEXT: 3 11 1.00 ucvtf h14, w0, #32 +# CHECK-NEXT: 3 11 1.00 ucvtf h23, x19, #1 +# CHECK-NEXT: 3 11 1.00 ucvtf h31, xzr, #20 +# CHECK-NEXT: 3 11 1.00 ucvtf h14, x0, #64 +# CHECK-NEXT: 3 11 1.00 ucvtf s23, w19, #1 +# CHECK-NEXT: 3 11 1.00 ucvtf s31, wzr, #20 +# CHECK-NEXT: 3 11 1.00 ucvtf s14, w0, #32 +# CHECK-NEXT: 3 11 1.00 ucvtf s23, x19, #1 +# CHECK-NEXT: 3 11 1.00 ucvtf s31, xzr, #20 +# CHECK-NEXT: 3 11 1.00 ucvtf s14, x0, #64 +# CHECK-NEXT: 3 11 1.00 ucvtf d23, w19, #1 +# CHECK-NEXT: 3 11 1.00 ucvtf d31, wzr, #20 +# CHECK-NEXT: 3 11 1.00 ucvtf d14, w0, #32 +# CHECK-NEXT: 3 11 1.00 ucvtf d23, x19, #1 +# CHECK-NEXT: 3 11 1.00 ucvtf d31, xzr, #20 +# CHECK-NEXT: 3 11 1.00 ucvtf d14, x0, #64 +# CHECK-NEXT: 2 7 1.00 fcvtns w3, h31 +# CHECK-NEXT: 2 7 1.00 fcvtns xzr, h12 +# CHECK-NEXT: 2 7 1.00 fcvtnu wzr, h12 +# CHECK-NEXT: 2 7 1.00 fcvtnu x0, h0 +# CHECK-NEXT: 2 7 1.00 fcvtps wzr, h9 +# CHECK-NEXT: 2 7 1.00 fcvtps x12, h20 +# CHECK-NEXT: 2 7 1.00 fcvtpu w30, h23 +# CHECK-NEXT: 2 7 1.00 fcvtpu x29, h3 +# CHECK-NEXT: 2 7 1.00 fcvtms w2, h3 +# CHECK-NEXT: 2 7 1.00 fcvtms x4, h5 +# CHECK-NEXT: 2 7 1.00 fcvtmu w6, h7 +# CHECK-NEXT: 2 7 1.00 fcvtmu x8, h9 +# CHECK-NEXT: 2 7 1.00 fcvtzs w10, h11 +# CHECK-NEXT: 2 7 1.00 fcvtzs x12, h13 +# CHECK-NEXT: 2 7 1.00 fcvtzu w14, h15 +# CHECK-NEXT: 2 7 1.00 fcvtzu x15, h16 +# CHECK-NEXT: 3 11 1.00 scvtf h17, w18 +# CHECK-NEXT: 3 11 1.00 scvtf h19, x20 +# CHECK-NEXT: 3 11 1.00 ucvtf h21, w22 +# CHECK-NEXT: 3 11 1.00 scvtf h23, x24 +# CHECK-NEXT: 2 7 1.00 fcvtas w25, h26 +# CHECK-NEXT: 2 7 1.00 fcvtas x27, h28 +# CHECK-NEXT: 2 7 1.00 fcvtau w29, h30 +# CHECK-NEXT: 2 7 1.00 fcvtau xzr, h0 +# CHECK-NEXT: 2 7 1.00 fcvtns w3, s31 +# CHECK-NEXT: 2 7 1.00 fcvtns xzr, s12 +# 
CHECK-NEXT: 2 7 1.00 fcvtnu wzr, s12 +# CHECK-NEXT: 2 7 1.00 fcvtnu x0, s0 +# CHECK-NEXT: 2 7 1.00 fcvtps wzr, s9 +# CHECK-NEXT: 2 7 1.00 fcvtps x12, s20 +# CHECK-NEXT: 2 7 1.00 fcvtpu w30, s23 +# CHECK-NEXT: 2 7 1.00 fcvtpu x29, s3 +# CHECK-NEXT: 2 7 1.00 fcvtms w2, s3 +# CHECK-NEXT: 2 7 1.00 fcvtms x4, s5 +# CHECK-NEXT: 2 7 1.00 fcvtmu w6, s7 +# CHECK-NEXT: 2 7 1.00 fcvtmu x8, s9 +# CHECK-NEXT: 2 7 1.00 fcvtzs w10, s11 +# CHECK-NEXT: 2 7 1.00 fcvtzs x12, s13 +# CHECK-NEXT: 2 7 1.00 fcvtzu w14, s15 +# CHECK-NEXT: 2 7 1.00 fcvtzu x15, s16 +# CHECK-NEXT: 3 11 1.00 scvtf s17, w18 +# CHECK-NEXT: 3 11 1.00 scvtf s19, x20 +# CHECK-NEXT: 3 11 1.00 ucvtf s21, w22 +# CHECK-NEXT: 3 11 1.00 scvtf s23, x24 +# CHECK-NEXT: 2 7 1.00 fcvtas w25, s26 +# CHECK-NEXT: 2 7 1.00 fcvtas x27, s28 +# CHECK-NEXT: 2 7 1.00 fcvtau w29, s30 +# CHECK-NEXT: 2 7 1.00 fcvtau xzr, s0 +# CHECK-NEXT: 2 7 1.00 fcvtns w3, d31 +# CHECK-NEXT: 2 7 1.00 fcvtns xzr, d12 +# CHECK-NEXT: 2 7 1.00 fcvtnu wzr, d12 +# CHECK-NEXT: 2 7 1.00 fcvtnu x0, d0 +# CHECK-NEXT: 2 7 1.00 fcvtps wzr, d9 +# CHECK-NEXT: 2 7 1.00 fcvtps x12, d20 +# CHECK-NEXT: 2 7 1.00 fcvtpu w30, d23 +# CHECK-NEXT: 2 7 1.00 fcvtpu x29, d3 +# CHECK-NEXT: 2 7 1.00 fcvtms w2, d3 +# CHECK-NEXT: 2 7 1.00 fcvtms x4, d5 +# CHECK-NEXT: 2 7 1.00 fcvtmu w6, d7 +# CHECK-NEXT: 2 7 1.00 fcvtmu x8, d9 +# CHECK-NEXT: 2 7 1.00 fcvtzs w10, d11 +# CHECK-NEXT: 2 7 1.00 fcvtzs x12, d13 +# CHECK-NEXT: 2 7 1.00 fcvtzu w14, d15 +# CHECK-NEXT: 2 7 1.00 fcvtzu x15, d16 +# CHECK-NEXT: 3 11 1.00 scvtf d17, w18 +# CHECK-NEXT: 3 11 1.00 scvtf d19, x20 +# CHECK-NEXT: 3 11 1.00 ucvtf d21, w22 +# CHECK-NEXT: 3 11 1.00 ucvtf d23, x24 +# CHECK-NEXT: 2 7 1.00 fcvtas w25, d26 +# CHECK-NEXT: 2 7 1.00 fcvtas x27, d28 +# CHECK-NEXT: 2 7 1.00 fcvtau w29, d30 +# CHECK-NEXT: 2 7 1.00 fcvtau xzr, d0 +# CHECK-NEXT: 1 5 1.00 fmov w3, s9 +# CHECK-NEXT: 1 3 1.00 fmov s9, w3 +# CHECK-NEXT: 1 5 1.00 fmov x20, d31 +# CHECK-NEXT: 1 3 1.00 fmov d1, x15 +# CHECK-NEXT: 2 7 1.00 fmov x3, v12.d[1] 
+# CHECK-NEXT: 1 5 1.00 fmov v1.d[1], x19 +# CHECK-NEXT: 1 2 0.50 fmov s2, #0.12500000 +# CHECK-NEXT: 1 2 0.50 fmov s3, #1.00000000 +# CHECK-NEXT: 1 2 0.50 fmov d30, #16.00000000 +# CHECK-NEXT: 1 2 0.50 fmov s4, #1.06250000 +# CHECK-NEXT: 1 2 0.50 fmov d10, #1.93750000 +# CHECK-NEXT: 1 2 0.50 fmov s12, #-1.00000000 +# CHECK-NEXT: 1 2 0.50 fmov d16, #8.50000000 +# CHECK-NEXT: 1 3 0.50 * ldr w3, #0 +# CHECK-NEXT: 1 3 0.50 * ldr x29, #4 +# CHECK-NEXT: 1 3 0.50 * ldrsw xzr, #-4 +# CHECK-NEXT: 1 3 0.50 * ldr s0, #8 +# CHECK-NEXT: 1 3 0.50 * ldr d0, #1048572 +# CHECK-NEXT: 1 3 0.50 * ldr q0, #-1048576 +# CHECK-NEXT: 1 1 0.50 U prfm pldl1strm, #0 +# CHECK-NEXT: 1 1 0.50 U prfm #22, #0 +# CHECK-NEXT: 2 4 0.50 * * U stxrb w18, w8, [sp] +# CHECK-NEXT: 2 4 0.50 * * U stxrh w24, w15, [x16] +# CHECK-NEXT: 2 4 0.50 * * U stxr w5, w6, [x17] +# CHECK-NEXT: 2 4 0.50 * * U stxr w1, x10, [x21] +# CHECK-NEXT: 1 3 0.50 * * U ldxrb w30, [x0] +# CHECK-NEXT: 1 3 0.50 * * U ldxrh w17, [x4] +# CHECK-NEXT: 1 3 0.50 * * U ldxr w22, [sp] +# CHECK-NEXT: 1 3 0.50 * * U ldxr x11, [x29] +# CHECK-NEXT: 1 3 0.50 * * U ldxr x11, [x29] +# CHECK-NEXT: 1 3 0.50 * * U ldxr x11, [x29] +# CHECK-NEXT: 2 4 0.50 * * U stxp w12, w11, w10, [sp] +# CHECK-NEXT: 2 4 0.50 * * U stxp wzr, x27, x9, [x12] +# CHECK-NEXT: 2 3 0.50 * * U ldxp w0, wzr, [sp] +# CHECK-NEXT: 2 3 0.50 * * U ldxp x17, x0, [x18] +# CHECK-NEXT: 2 3 0.50 * * U ldxp x17, x0, [x18] +# CHECK-NEXT: 2 4 0.50 * * U stlxrb w12, w22, [x0] +# CHECK-NEXT: 2 4 0.50 * * U stlxrh w10, w1, [x1] +# CHECK-NEXT: 2 4 0.50 * * U stlxr w9, w2, [x2] +# CHECK-NEXT: 2 4 0.50 * * U stlxr w9, x3, [sp] +# CHECK-NEXT: 1 3 0.50 * * U ldaxrb w8, [x4] +# CHECK-NEXT: 1 3 0.50 * * U ldaxrh w7, [x5] +# CHECK-NEXT: 1 3 0.50 * * U ldaxr w6, [sp] +# CHECK-NEXT: 1 3 0.50 * * U ldaxr x5, [x6] +# CHECK-NEXT: 1 3 0.50 * * U ldaxr x5, [x6] +# CHECK-NEXT: 1 3 0.50 * * U ldaxr x5, [x6] +# CHECK-NEXT: 2 4 0.50 * * U stlxp w4, w5, w6, [sp] +# CHECK-NEXT: 2 4 0.50 * * U stlxp wzr, x6, x7, 
[x1] +# CHECK-NEXT: 2 3 0.50 * * U ldaxp w5, w18, [sp] +# CHECK-NEXT: 2 3 0.50 * * U ldaxp x6, x19, [x22] +# CHECK-NEXT: 2 3 0.50 * * U ldaxp x6, x19, [x22] +# CHECK-NEXT: 1 1 0.50 * U stlrb w24, [sp] +# CHECK-NEXT: 1 1 0.50 * U stlrh w25, [x30] +# CHECK-NEXT: 1 1 0.50 * U stlr w26, [x29] +# CHECK-NEXT: 1 1 0.50 * U stlr x27, [x28] +# CHECK-NEXT: 1 1 0.50 * U stlr x27, [x28] +# CHECK-NEXT: 1 1 0.50 * U stlr x27, [x28] +# CHECK-NEXT: 1 3 0.50 * U ldarb w23, [sp] +# CHECK-NEXT: 1 3 0.50 * U ldarh w22, [x30] +# CHECK-NEXT: 1 3 0.50 * U ldar wzr, [x29] +# CHECK-NEXT: 1 3 0.50 * U ldar x21, [x28] +# CHECK-NEXT: 1 3 0.50 * U ldar x21, [x28] +# CHECK-NEXT: 1 3 0.50 * U ldar x21, [x28] +# CHECK-NEXT: 1 1 0.50 * sturb w9, [sp] +# CHECK-NEXT: 1 1 0.50 * sturh wzr, [x12, #255] +# CHECK-NEXT: 1 1 0.50 * stur w16, [x0, #-256] +# CHECK-NEXT: 1 1 0.50 * stur x28, [x14, #1] +# CHECK-NEXT: 1 3 0.50 * ldurb w1, [x20, #255] +# CHECK-NEXT: 1 3 0.50 * ldurh w20, [x1, #255] +# CHECK-NEXT: 1 3 0.50 * ldur w12, [sp, #255] +# CHECK-NEXT: 1 3 0.50 * ldur xzr, [x12, #255] +# CHECK-NEXT: 1 3 0.50 * ldursb x9, [x7, #-256] +# CHECK-NEXT: 1 3 0.50 * ldursh x17, [x19, #-256] +# CHECK-NEXT: 1 3 0.50 * ldursw x20, [x15, #-256] +# CHECK-NEXT: 1 1 0.50 U prfum pldl2keep, [sp, #-256] +# CHECK-NEXT: 1 3 0.50 * ldursb w19, [x1, #-256] +# CHECK-NEXT: 1 3 0.50 * ldursh w15, [x21, #-256] +# CHECK-NEXT: 2 2 1.00 * stur b0, [sp, #1] +# CHECK-NEXT: 2 2 1.00 * stur h12, [x12, #-1] +# CHECK-NEXT: 2 2 1.00 * stur s15, [x0, #255] +# CHECK-NEXT: 2 2 1.00 * stur d31, [x5, #25] +# CHECK-NEXT: 2 2 1.00 * stur q9, [x5] +# CHECK-NEXT: 1 4 0.50 * ldur b3, [sp] +# CHECK-NEXT: 1 4 0.50 * ldur h5, [x4, #-256] +# CHECK-NEXT: 1 4 0.50 * ldur s7, [x12, #-1] +# CHECK-NEXT: 1 4 0.50 * ldur d11, [x19, #4] +# CHECK-NEXT: 1 4 0.50 * ldur q13, [x1, #2] +# CHECK-NEXT: 2 1 0.50 * strb w9, [x2], #255 +# CHECK-NEXT: 2 1 0.50 * strb w10, [x3], #1 +# CHECK-NEXT: 2 1 0.50 * strb w10, [x3], #-256 +# CHECK-NEXT: 2 1 0.50 * strh w9, [x2], 
#255 +# CHECK-NEXT: 2 1 0.50 * strh w9, [x2], #1 +# CHECK-NEXT: 2 1 0.50 * strh w10, [x3], #-256 +# CHECK-NEXT: 2 1 0.50 * str w19, [sp], #255 +# CHECK-NEXT: 2 1 0.50 * str w20, [x30], #1 +# CHECK-NEXT: 2 1 0.50 * str w21, [x12], #-256 +# CHECK-NEXT: 2 1 0.50 * str xzr, [x9], #255 +# CHECK-NEXT: 2 1 0.50 * str x2, [x3], #1 +# CHECK-NEXT: 2 1 0.50 * str x19, [x12], #-256 +# CHECK-NEXT: 2 3 0.50 * ldrb w9, [x2], #255 +# CHECK-NEXT: 2 3 0.50 * ldrb w10, [x3], #1 +# CHECK-NEXT: 2 3 0.50 * ldrb w10, [x3], #-256 +# CHECK-NEXT: 2 3 0.50 * ldrh w9, [x2], #255 +# CHECK-NEXT: 2 3 0.50 * ldrh w9, [x2], #1 +# CHECK-NEXT: 2 3 0.50 * ldrh w10, [x3], #-256 +# CHECK-NEXT: 2 3 0.50 * ldr w19, [sp], #255 +# CHECK-NEXT: 2 3 0.50 * ldr w20, [x30], #1 +# CHECK-NEXT: 2 3 0.50 * ldr w21, [x12], #-256 +# CHECK-NEXT: 2 3 0.50 * ldr xzr, [x9], #255 +# CHECK-NEXT: 2 3 0.50 * ldr x2, [x3], #1 +# CHECK-NEXT: 2 3 0.50 * ldr x19, [x12], #-256 +# CHECK-NEXT: 2 3 0.50 * ldrsb xzr, [x9], #255 +# CHECK-NEXT: 2 3 0.50 * ldrsb x2, [x3], #1 +# CHECK-NEXT: 2 3 0.50 * ldrsb x19, [x12], #-256 +# CHECK-NEXT: 2 3 0.50 * ldrsh xzr, [x9], #255 +# CHECK-NEXT: 2 3 0.50 * ldrsh x2, [x3], #1 +# CHECK-NEXT: 2 3 0.50 * ldrsh x19, [x12], #-256 +# CHECK-NEXT: 2 3 0.50 * ldrsw xzr, [x9], #255 +# CHECK-NEXT: 2 3 0.50 * ldrsw x2, [x3], #1 +# CHECK-NEXT: 2 3 0.50 * ldrsw x19, [x12], #-256 +# CHECK-NEXT: 2 3 0.50 * ldrsb wzr, [x9], #255 +# CHECK-NEXT: 2 3 0.50 * ldrsb w2, [x3], #1 +# CHECK-NEXT: 2 3 0.50 * ldrsb w19, [x12], #-256 +# CHECK-NEXT: 2 3 0.50 * ldrsh wzr, [x9], #255 +# CHECK-NEXT: 2 3 0.50 * ldrsh w2, [x3], #1 +# CHECK-NEXT: 2 3 0.50 * ldrsh w19, [x12], #-256 +# CHECK-NEXT: 2 1 0.50 * str b0, [x0], #255 +# CHECK-NEXT: 2 1 0.50 * str b3, [x3], #1 +# CHECK-NEXT: 2 1 0.50 * str b5, [sp], #-256 +# CHECK-NEXT: 2 1 0.50 * str h10, [x10], #255 +# CHECK-NEXT: 2 1 0.50 * str h13, [x23], #1 +# CHECK-NEXT: 2 1 0.50 * str h15, [sp], #-256 +# CHECK-NEXT: 2 1 0.50 * str s20, [x20], #255 +# CHECK-NEXT: 2 1 0.50 * str s23, 
[x23], #1 +# CHECK-NEXT: 2 1 0.50 * str s25, [x0], #-256 +# CHECK-NEXT: 2 1 0.50 * str d20, [x20], #255 +# CHECK-NEXT: 2 1 0.50 * str d23, [x23], #1 +# CHECK-NEXT: 2 1 0.50 * str d25, [x0], #-256 +# CHECK-NEXT: 2 3 0.50 * ldr b0, [x0], #255 +# CHECK-NEXT: 2 3 0.50 * ldr b3, [x3], #1 +# CHECK-NEXT: 2 3 0.50 * ldr b5, [sp], #-256 +# CHECK-NEXT: 2 3 0.50 * ldr h10, [x10], #255 +# CHECK-NEXT: 2 3 0.50 * ldr h13, [x23], #1 +# CHECK-NEXT: 2 3 0.50 * ldr h15, [sp], #-256 +# CHECK-NEXT: 2 3 0.50 * ldr s20, [x20], #255 +# CHECK-NEXT: 2 3 0.50 * ldr s23, [x23], #1 +# CHECK-NEXT: 2 3 0.50 * ldr s25, [x0], #-256 +# CHECK-NEXT: 2 3 0.50 * ldr d20, [x20], #255 +# CHECK-NEXT: 2 3 0.50 * ldr d23, [x23], #1 +# CHECK-NEXT: 2 3 0.50 * ldr d25, [x0], #-256 +# CHECK-NEXT: 2 3 0.50 * ldr q20, [x1], #255 +# CHECK-NEXT: 2 3 0.50 * ldr q23, [x9], #1 +# CHECK-NEXT: 2 3 0.50 * ldr q25, [x20], #-256 +# CHECK-NEXT: 2 1 0.50 * str q10, [x1], #255 +# CHECK-NEXT: 2 1 0.50 * str q22, [sp], #1 +# CHECK-NEXT: 2 1 0.50 * str q21, [x20], #-256 +# CHECK-NEXT: 2 3 0.50 * ldr x3, [x4, #0]! +# CHECK-NEXT: 2 1 0.50 * strb w9, [x2, #255]! +# CHECK-NEXT: 2 1 0.50 * strb w10, [x3, #1]! +# CHECK-NEXT: 2 1 0.50 * strb w10, [x3, #-256]! +# CHECK-NEXT: 2 1 0.50 * strh w9, [x2, #255]! +# CHECK-NEXT: 2 1 0.50 * strh w9, [x2, #1]! +# CHECK-NEXT: 2 1 0.50 * strh w10, [x3, #-256]! +# CHECK-NEXT: 2 1 0.50 * str w19, [sp, #255]! +# CHECK-NEXT: 2 1 0.50 * str w20, [x30, #1]! +# CHECK-NEXT: 2 1 0.50 * str w21, [x12, #-256]! +# CHECK-NEXT: 2 1 0.50 * str xzr, [x9, #255]! +# CHECK-NEXT: 2 1 0.50 * str x2, [x3, #1]! +# CHECK-NEXT: 2 1 0.50 * str x19, [x12, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldrb w9, [x2, #255]! +# CHECK-NEXT: 2 3 0.50 * ldrb w10, [x3, #1]! +# CHECK-NEXT: 2 3 0.50 * ldrb w10, [x3, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldrh w9, [x2, #255]! +# CHECK-NEXT: 2 3 0.50 * ldrh w9, [x2, #1]! +# CHECK-NEXT: 2 3 0.50 * ldrh w10, [x3, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldr w19, [sp, #255]! 
+# CHECK-NEXT: 2 3 0.50 * ldr w20, [x30, #1]! +# CHECK-NEXT: 2 3 0.50 * ldr w21, [x12, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldr xzr, [x9, #255]! +# CHECK-NEXT: 2 3 0.50 * ldr x2, [x3, #1]! +# CHECK-NEXT: 2 3 0.50 * ldr x19, [x12, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldrsb xzr, [x9, #255]! +# CHECK-NEXT: 2 3 0.50 * ldrsb x2, [x3, #1]! +# CHECK-NEXT: 2 3 0.50 * ldrsb x19, [x12, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldrsh xzr, [x9, #255]! +# CHECK-NEXT: 2 3 0.50 * ldrsh x2, [x3, #1]! +# CHECK-NEXT: 2 3 0.50 * ldrsh x19, [x12, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldrsw xzr, [x9, #255]! +# CHECK-NEXT: 2 3 0.50 * ldrsw x2, [x3, #1]! +# CHECK-NEXT: 2 3 0.50 * ldrsw x19, [x12, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldrsb wzr, [x9, #255]! +# CHECK-NEXT: 2 3 0.50 * ldrsb w2, [x3, #1]! +# CHECK-NEXT: 2 3 0.50 * ldrsb w19, [x12, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldrsh wzr, [x9, #255]! +# CHECK-NEXT: 2 3 0.50 * ldrsh w2, [x3, #1]! +# CHECK-NEXT: 2 3 0.50 * ldrsh w19, [x12, #-256]! +# CHECK-NEXT: 2 1 0.50 * str b0, [x0, #255]! +# CHECK-NEXT: 2 1 0.50 * str b3, [x3, #1]! +# CHECK-NEXT: 2 1 0.50 * str b5, [sp, #-256]! +# CHECK-NEXT: 2 1 0.50 * str h10, [x10, #255]! +# CHECK-NEXT: 2 1 0.50 * str h13, [x23, #1]! +# CHECK-NEXT: 2 1 0.50 * str h15, [sp, #-256]! +# CHECK-NEXT: 2 1 0.50 * str s20, [x20, #255]! +# CHECK-NEXT: 2 1 0.50 * str s23, [x23, #1]! +# CHECK-NEXT: 2 1 0.50 * str s25, [x0, #-256]! +# CHECK-NEXT: 2 1 0.50 * str d20, [x20, #255]! +# CHECK-NEXT: 2 1 0.50 * str d23, [x23, #1]! +# CHECK-NEXT: 2 1 0.50 * str d25, [x0, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldr b0, [x0, #255]! +# CHECK-NEXT: 2 3 0.50 * ldr b3, [x3, #1]! +# CHECK-NEXT: 2 3 0.50 * ldr b5, [sp, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldr h10, [x10, #255]! +# CHECK-NEXT: 2 3 0.50 * ldr h13, [x23, #1]! +# CHECK-NEXT: 2 3 0.50 * ldr h15, [sp, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldr s20, [x20, #255]! +# CHECK-NEXT: 2 3 0.50 * ldr s23, [x23, #1]! +# CHECK-NEXT: 2 3 0.50 * ldr s25, [x0, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldr d20, [x20, #255]! 
+# CHECK-NEXT: 2 3 0.50 * ldr d23, [x23, #1]! +# CHECK-NEXT: 2 3 0.50 * ldr d25, [x0, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldr q20, [x1, #255]! +# CHECK-NEXT: 2 3 0.50 * ldr q23, [x9, #1]! +# CHECK-NEXT: 2 3 0.50 * ldr q25, [x20, #-256]! +# CHECK-NEXT: 2 1 0.50 * str q10, [x1, #255]! +# CHECK-NEXT: 2 1 0.50 * str q22, [sp, #1]! +# CHECK-NEXT: 2 1 0.50 * str q21, [x20, #-256]! +# CHECK-NEXT: 1 1 0.50 * sttrb w9, [sp] +# CHECK-NEXT: 1 1 0.50 * sttrh wzr, [x12, #255] +# CHECK-NEXT: 1 1 0.50 * sttr w16, [x0, #-256] +# CHECK-NEXT: 1 1 0.50 * sttr x28, [x14, #1] +# CHECK-NEXT: 1 3 0.50 * ldtrb w1, [x20, #255] +# CHECK-NEXT: 1 3 0.50 * ldtrh w20, [x1, #255] +# CHECK-NEXT: 1 3 0.50 * ldtr w12, [sp, #255] +# CHECK-NEXT: 1 3 0.50 * ldtr xzr, [x12, #255] +# CHECK-NEXT: 1 3 0.50 * ldtrsb x9, [x7, #-256] +# CHECK-NEXT: 1 3 0.50 * ldtrsh x17, [x19, #-256] +# CHECK-NEXT: 1 3 0.50 * ldtrsw x20, [x15, #-256] +# CHECK-NEXT: 1 3 0.50 * ldtrsb w19, [x1, #-256] +# CHECK-NEXT: 1 3 0.50 * ldtrsh w15, [x21, #-256] +# CHECK-NEXT: 1 3 0.50 * ldr x4, [x29] +# CHECK-NEXT: 1 3 0.50 * ldr x30, [x12, #32760] +# CHECK-NEXT: 1 3 0.50 * ldr x20, [sp, #8] +# CHECK-NEXT: 1 3 0.50 * ldr xzr, [sp] +# CHECK-NEXT: 1 3 0.50 * ldr w2, [sp] +# CHECK-NEXT: 1 3 0.50 * ldr w17, [sp, #16380] +# CHECK-NEXT: 1 3 0.50 * ldr w13, [x2, #4] +# CHECK-NEXT: 1 3 0.50 * ldrsw x2, [x5, #4] +# CHECK-NEXT: 1 3 0.50 * ldrsw x23, [sp, #16380] +# CHECK-NEXT: 1 3 0.50 * ldrh w2, [x4] +# CHECK-NEXT: 1 3 0.50 * ldrsh w23, [x6, #8190] +# CHECK-NEXT: 1 3 0.50 * ldrsh wzr, [sp, #2] +# CHECK-NEXT: 1 3 0.50 * ldrsh x29, [x2, #2] +# CHECK-NEXT: 1 3 0.50 * ldrb w26, [x3, #121] +# CHECK-NEXT: 1 3 0.50 * ldrb w12, [x2] +# CHECK-NEXT: 1 3 0.50 * ldrsb w27, [sp, #4095] +# CHECK-NEXT: 1 3 0.50 * ldrsb xzr, [x15] +# CHECK-NEXT: 1 1 0.50 * str x30, [sp] +# CHECK-NEXT: 1 1 0.50 * str w20, [x4, #16380] +# CHECK-NEXT: 1 1 0.50 * strh w17, [sp, #8190] +# CHECK-NEXT: 1 1 0.50 * strb w23, [x3, #4095] +# CHECK-NEXT: 1 1 0.50 * strb wzr, [x2] +# 
CHECK-NEXT: 1 3 0.50 * ldr b31, [sp, #4095] +# CHECK-NEXT: 1 3 0.50 * ldr h20, [x2, #8190] +# CHECK-NEXT: 1 3 0.50 * ldr s10, [x19, #16380] +# CHECK-NEXT: 1 3 0.50 * ldr d3, [x10, #32760] +# CHECK-NEXT: 2 2 1.00 * str q12, [sp, #65520] +# CHECK-NEXT: 1 3 0.50 * ldrb w3, [sp, x5] +# CHECK-NEXT: 1 3 0.50 * ldrb w9, [x27, x6] +# CHECK-NEXT: 1 3 0.50 * ldrsb w10, [x30, x7] +# CHECK-NEXT: 1 3 0.50 * ldrb w11, [x29, x3, sxtx] +# CHECK-NEXT: 2 1 1.00 * strb w12, [x28, xzr, sxtx] +# CHECK-NEXT: 1 3 0.50 * ldrb w14, [x26, w6, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldrsb w15, [x25, w7, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldrb w17, [x23, w9, sxtw] +# CHECK-NEXT: 1 3 0.50 * ldrsb x18, [x22, w10, sxtw] +# CHECK-NEXT: 1 3 0.50 * ldrsh w3, [sp, x5] +# CHECK-NEXT: 1 3 0.50 * ldrsh w9, [x27, x6] +# CHECK-NEXT: 1 3 0.50 * ldrh w10, [x30, x7, lsl #1] +# CHECK-NEXT: 2 1 1.00 * strh w11, [x29, x3, sxtx] +# CHECK-NEXT: 1 3 0.50 * ldrh w12, [x28, xzr, sxtx] +# CHECK-NEXT: 1 3 0.50 * ldrsh x13, [x27, x5, sxtx #1] +# CHECK-NEXT: 1 3 0.50 * ldrh w14, [x26, w6, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldrh w15, [x25, w7, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldrsh w16, [x24, w8, uxtw #1] +# CHECK-NEXT: 1 3 0.50 * ldrh w17, [x23, w9, sxtw] +# CHECK-NEXT: 1 3 0.50 * ldrh w18, [x22, w10, sxtw] +# CHECK-NEXT: 2 1 1.00 * strh w19, [x21, wzr, sxtw #1] +# CHECK-NEXT: 1 3 0.50 * ldr w3, [sp, x5] +# CHECK-NEXT: 1 4 0.50 * ldr s9, [x27, x6] +# CHECK-NEXT: 1 3 0.50 * ldr w10, [x30, x7, lsl #2] +# CHECK-NEXT: 1 3 0.50 * ldr w11, [x29, x3, sxtx] +# CHECK-NEXT: 2 2 1.00 * str s12, [x28, xzr, sxtx] +# CHECK-NEXT: 2 1 1.00 * str w13, [x27, x5, sxtx #2] +# CHECK-NEXT: 2 1 1.00 * str w14, [x26, w6, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldr w15, [x25, w7, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldr w16, [x24, w8, uxtw #2] +# CHECK-NEXT: 1 3 0.50 * ldrsw x17, [x23, w9, sxtw] +# CHECK-NEXT: 1 3 0.50 * ldr w18, [x22, w10, sxtw] +# CHECK-NEXT: 1 3 0.50 * ldrsw x19, [x21, wzr, sxtw #2] +# CHECK-NEXT: 1 3 0.50 * ldr x3, [sp, x5] +# CHECK-NEXT: 2 1 1.00 * 
str x9, [x27, x6] +# CHECK-NEXT: 1 4 0.50 * ldr d10, [x30, x7, lsl #3] +# CHECK-NEXT: 2 1 1.00 * str x11, [x29, x3, sxtx] +# CHECK-NEXT: 1 3 0.50 * ldr x12, [x28, xzr, sxtx] +# CHECK-NEXT: 1 3 0.50 * ldr x13, [x27, x5, sxtx #3] +# CHECK-NEXT: 1 1 0.50 U prfm pldl1keep, [x26, w6, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldr x15, [x25, w7, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldr x16, [x24, w8, uxtw #3] +# CHECK-NEXT: 1 3 0.50 * ldr x17, [x23, w9, sxtw] +# CHECK-NEXT: 1 3 0.50 * ldr x18, [x22, w10, sxtw] +# CHECK-NEXT: 2 2 1.00 * str d19, [x21, wzr, sxtw #3] +# CHECK-NEXT: 1 4 0.50 * ldr q3, [sp, x5] +# CHECK-NEXT: 1 4 0.50 * ldr q9, [x27, x6] +# CHECK-NEXT: 1 4 0.50 * ldr q10, [x30, x7, lsl #4] +# CHECK-NEXT: 2 2 1.00 * str q11, [x29, x3, sxtx] +# CHECK-NEXT: 2 2 1.00 * str q12, [x28, xzr, sxtx] +# CHECK-NEXT: 2 2 1.00 * str q13, [x27, x5, sxtx #4] +# CHECK-NEXT: 1 4 0.50 * ldr q14, [x26, w6, uxtw] +# CHECK-NEXT: 1 4 0.50 * ldr q15, [x25, w7, uxtw] +# CHECK-NEXT: 1 4 0.50 * ldr q16, [x24, w8, uxtw #4] +# CHECK-NEXT: 1 4 0.50 * ldr q17, [x23, w9, sxtw] +# CHECK-NEXT: 2 2 1.00 * str q18, [x22, w10, sxtw] +# CHECK-NEXT: 1 4 0.50 * ldr q19, [x21, wzr, sxtw #4] +# CHECK-NEXT: 1 3 0.50 * ldp w3, w5, [sp] +# CHECK-NEXT: 2 2 0.50 * stp wzr, w9, [sp, #252] +# CHECK-NEXT: 1 3 0.50 * ldp w2, wzr, [sp, #-256] +# CHECK-NEXT: 1 3 0.50 * ldp w9, w10, [sp, #4] +# CHECK-NEXT: 2 5 1.00 * ldpsw x9, x10, [sp, #4] +# CHECK-NEXT: 2 5 1.00 * ldpsw x9, x10, [x2, #-256] +# CHECK-NEXT: 2 5 1.00 * ldpsw x20, x30, [sp, #252] +# CHECK-NEXT: 1 3 0.50 * ldp x21, x29, [x2, #504] +# CHECK-NEXT: 1 3 0.50 * ldp x22, x23, [x3, #-512] +# CHECK-NEXT: 1 3 0.50 * ldp x24, x25, [x4, #8] +# CHECK-NEXT: 2 5 1.00 * ldp s29, s28, [sp, #252] +# CHECK-NEXT: 4 3 2.00 * stp s27, s26, [sp, #-256] +# CHECK-NEXT: 2 5 1.00 * ldp s1, s2, [x3, #44] +# CHECK-NEXT: 4 3 2.00 * stp d3, d5, [x9, #504] +# CHECK-NEXT: 4 3 2.00 * stp d7, d11, [x10, #-512] +# CHECK-NEXT: 2 5 1.00 * ldp d2, d3, [x30, #-8] +# CHECK-NEXT: 4 3 2.00 * stp q3, q5, 
[sp] +# CHECK-NEXT: 4 3 2.00 * stp q17, q19, [sp, #1008] +# CHECK-NEXT: 2 4 1.00 * ldp q23, q29, [x1, #-1024] +# CHECK-NEXT: 1 3 0.50 * ldp w3, w5, [sp], #0 +# CHECK-NEXT: 3 2 0.50 * stp wzr, w9, [sp], #252 +# CHECK-NEXT: 1 3 0.50 * ldp w2, wzr, [sp], #-256 +# CHECK-NEXT: 1 3 0.50 * ldp w9, w10, [sp], #4 +# CHECK-NEXT: 2 5 1.00 * ldpsw x9, x10, [sp], #4 +# CHECK-NEXT: 2 5 1.00 * ldpsw x9, x10, [x2], #-256 +# CHECK-NEXT: 2 5 1.00 * ldpsw x20, x30, [sp], #252 +# CHECK-NEXT: 1 3 0.50 * ldp x21, x29, [x2], #504 +# CHECK-NEXT: 1 3 0.50 * ldp x22, x23, [x3], #-512 +# CHECK-NEXT: 1 3 0.50 * ldp x24, x25, [x4], #8 +# CHECK-NEXT: 2 5 1.00 * ldp s29, s28, [sp], #252 +# CHECK-NEXT: 4 3 2.00 * stp s27, s26, [sp], #-256 +# CHECK-NEXT: 2 5 1.00 * ldp s1, s2, [x3], #44 +# CHECK-NEXT: 4 3 2.00 * stp d3, d5, [x9], #504 +# CHECK-NEXT: 4 3 2.00 * stp d7, d11, [x10], #-512 +# CHECK-NEXT: 2 5 1.00 * ldp d2, d3, [x30], #-8 +# CHECK-NEXT: 4 3 2.00 * stp q3, q5, [sp], #0 +# CHECK-NEXT: 4 3 2.00 * stp q17, q19, [sp], #1008 +# CHECK-NEXT: 2 4 1.00 * ldp q23, q29, [x1], #-1024 +# CHECK-NEXT: 1 3 0.50 * ldp w3, w5, [sp, #0]! +# CHECK-NEXT: 3 2 0.50 * stp wzr, w9, [sp, #252]! +# CHECK-NEXT: 1 3 0.50 * ldp w2, wzr, [sp, #-256]! +# CHECK-NEXT: 1 3 0.50 * ldp w9, w10, [sp, #4]! +# CHECK-NEXT: 2 5 1.00 * ldpsw x9, x10, [sp, #4]! +# CHECK-NEXT: 2 5 1.00 * ldpsw x9, x10, [x2, #-256]! +# CHECK-NEXT: 2 5 1.00 * ldpsw x20, x30, [sp, #252]! +# CHECK-NEXT: 1 3 0.50 * ldp x21, x29, [x2, #504]! +# CHECK-NEXT: 1 3 0.50 * ldp x22, x23, [x3, #-512]! +# CHECK-NEXT: 1 3 0.50 * ldp x24, x25, [x4, #8]! +# CHECK-NEXT: 2 5 1.00 * ldp s29, s28, [sp, #252]! +# CHECK-NEXT: 4 3 2.00 * stp s27, s26, [sp, #-256]! +# CHECK-NEXT: 2 5 1.00 * ldp s1, s2, [x3, #44]! +# CHECK-NEXT: 4 3 2.00 * stp d3, d5, [x9, #504]! +# CHECK-NEXT: 4 3 2.00 * stp d7, d11, [x10, #-512]! +# CHECK-NEXT: 2 5 1.00 * ldp d2, d3, [x30, #-8]! +# CHECK-NEXT: 4 3 2.00 * stp q3, q5, [sp, #0]! +# CHECK-NEXT: 4 3 2.00 * stp q17, q19, [sp, #1008]! 
+# CHECK-NEXT: 2 4 1.00 * ldp q23, q29, [x1, #-1024]! +# CHECK-NEXT: 1 3 0.50 * ldnp w3, w5, [sp] +# CHECK-NEXT: 2 1 1.00 * stnp wzr, w9, [sp, #252] +# CHECK-NEXT: 1 3 0.50 * ldnp w2, wzr, [sp, #-256] +# CHECK-NEXT: 1 3 0.50 * ldnp w9, w10, [sp, #4] +# CHECK-NEXT: 1 3 0.50 * ldnp x21, x29, [x2, #504] +# CHECK-NEXT: 1 3 0.50 * ldnp x22, x23, [x3, #-512] +# CHECK-NEXT: 1 3 0.50 * ldnp x24, x25, [x4, #8] +# CHECK-NEXT: 2 5 1.00 * ldnp s29, s28, [sp, #252] +# CHECK-NEXT: 4 3 2.00 * stnp s27, s26, [sp, #-256] +# CHECK-NEXT: 2 5 1.00 * ldnp s1, s2, [x3, #44] +# CHECK-NEXT: 4 3 2.00 * stnp d3, d5, [x9, #504] +# CHECK-NEXT: 4 3 2.00 * stnp d7, d11, [x10, #-512] +# CHECK-NEXT: 2 5 1.00 * ldnp d2, d3, [x30, #-8] +# CHECK-NEXT: 4 3 2.00 * stnp q3, q5, [sp] +# CHECK-NEXT: 4 3 2.00 * stnp q17, q19, [sp, #1008] +# CHECK-NEXT: 2 4 1.00 * ldnp q23, q29, [x1, #-1024] +# CHECK-NEXT: 1 1 0.25 mov w3, #983055 +# CHECK-NEXT: 1 1 0.25 mov x10, #-6148914691236517206 +# CHECK-NEXT: 1 1 0.25 and w12, w23, w21 +# CHECK-NEXT: 1 1 0.25 and w16, w15, w1, lsl #1 +# CHECK-NEXT: 2 2 0.50 and w9, w4, w10, lsl #31 +# CHECK-NEXT: 1 1 0.25 and w3, w30, w11 +# CHECK-NEXT: 2 2 0.50 and x3, x5, x7, lsl #63 +# CHECK-NEXT: 2 2 0.50 and x5, x14, x19, asr #4 +# CHECK-NEXT: 2 2 0.50 and w3, w17, w19, ror #31 +# CHECK-NEXT: 2 2 0.50 and w0, w2, wzr, lsr #17 +# CHECK-NEXT: 2 2 0.50 and w3, w30, w11, asr #2 +# CHECK-NEXT: 1 1 0.25 and xzr, x4, x26 +# CHECK-NEXT: 2 2 0.50 and w3, wzr, w20, ror #2 +# CHECK-NEXT: 2 2 0.50 and x7, x20, xzr, asr #63 +# CHECK-NEXT: 2 2 0.50 bic x13, x20, x14, lsl #47 +# CHECK-NEXT: 1 1 0.25 bic w2, w7, w9 +# CHECK-NEXT: 2 2 0.50 orr w2, w7, w0, asr #31 +# CHECK-NEXT: 2 2 0.50 orr x8, x9, x10, lsl #12 +# CHECK-NEXT: 2 2 0.50 orn x3, x5, x7, asr #2 +# CHECK-NEXT: 1 1 0.25 orn w2, w5, w29 +# CHECK-NEXT: 1 1 0.25 ands w7, wzr, w9, lsl #1 +# CHECK-NEXT: 2 2 0.50 ands x3, x5, x20, ror #63 +# CHECK-NEXT: 1 1 0.25 bics w3, w5, w7 +# CHECK-NEXT: 1 1 0.25 bics x3, xzr, x3, lsl #1 +# 
CHECK-NEXT: 2 2 0.50 tst w3, w7, lsl #31 +# CHECK-NEXT: 2 2 0.50 tst x2, x20, asr #2 +# CHECK-NEXT: 1 1 0.25 mov x3, x6 +# CHECK-NEXT: 1 1 0.25 mov x3, xzr +# CHECK-NEXT: 1 1 0.25 mov wzr, w2 +# CHECK-NEXT: 1 1 0.25 mov w3, w5 +# CHECK-NEXT: 1 1 0.25 movz w2, #0, lsl #16 +# CHECK-NEXT: 1 1 0.25 mov w2, #-1235 +# CHECK-NEXT: 1 1 0.25 mov x2, #5299989643264 +# CHECK-NEXT: 1 1 0.25 mov x2, #0 +# CHECK-NEXT: 1 1 0.25 movk w3, #0 +# CHECK-NEXT: 1 1 0.25 movz x4, #0, lsl #16 +# CHECK-NEXT: 1 1 0.25 movk w5, #0, lsl #16 +# CHECK-NEXT: 1 1 0.25 movz x6, #0, lsl #32 +# CHECK-NEXT: 1 1 0.25 movk x7, #0, lsl #32 +# CHECK-NEXT: 1 1 0.25 movz x8, #0, lsl #48 +# CHECK-NEXT: 1 1 0.25 movk x9, #0, lsl #48 +# CHECK-NEXT: 1 1 0.50 adr x2, #1600 +# CHECK-NEXT: 1 1 0.50 adrp x21, #6553600 +# CHECK-NEXT: 1 1 0.50 adr x0, #262144 +# CHECK-NEXT: 1 1 0.50 tbz x12, #62, #0 +# CHECK-NEXT: 1 1 0.50 tbz x12, #62, #4 +# CHECK-NEXT: 1 1 0.50 tbz x12, #62, #-32768 +# CHECK-NEXT: 1 1 0.50 tbnz x12, #60, #32764 +# CHECK-NEXT: 1 1 0.50 b #4 +# CHECK-NEXT: 1 1 0.50 b #-4 +# CHECK-NEXT: 1 1 0.50 b #134217724 +# CHECK-NEXT: 1 1 1.00 br x20 +# CHECK-NEXT: 2 1 1.00 blr xzr +# CHECK-NEXT: 1 1 0.50 U ret x10 +# CHECK-NEXT: 1 1 0.50 U ret +# CHECK-NEXT: 1 1 1.00 U eret +# CHECK-NEXT: 1 1 1.00 U drps + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - Ampere1BUnitA +# CHECK-NEXT: [0.1] - Ampere1BUnitA +# CHECK-NEXT: [1.0] - Ampere1BUnitB +# CHECK-NEXT: [1.1] - Ampere1BUnitB +# CHECK-NEXT: [2] - Ampere1BUnitBS +# CHECK-NEXT: [3.0] - Ampere1BUnitL +# CHECK-NEXT: [3.1] - Ampere1BUnitL +# CHECK-NEXT: [4.0] - Ampere1BUnitS +# CHECK-NEXT: [4.1] - Ampere1BUnitS +# CHECK-NEXT: [5] - Ampere1BUnitX +# CHECK-NEXT: [6] - Ampere1BUnitY +# CHECK-NEXT: [7] - Ampere1BUnitZ + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4.0] [4.1] [5] [6] [7] +# CHECK-NEXT: 190.00 190.00 211.00 211.00 143.00 130.50 130.50 83.00 83.00 159.00 126.00 150.00 + +# CHECK: Resource pressure by 
instruction: +# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4.0] [4.1] [5] [6] [7] Instructions: +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add w2, w3, #4095 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add w30, w29, #1, lsl #12 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add w13, w5, #4095, lsl #12 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add x5, x7, #1638 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add w20, wsp, #801 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add wsp, wsp, #1104 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add wsp, w30, #4084 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add x0, x24, #291 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add x3, x24, #4095, lsl #12 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add x8, sp, #1074 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add sp, x29, #3816 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub w0, wsp, #4077 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub w4, w20, #546, lsl #12 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub sp, sp, #288 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub wsp, w19, #16 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adds w13, w23, #291, lsl #12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cmn w2, #4095 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adds w20, wsp, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cmn x3, #1, lsl #12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cmp sp, #20, lsl #12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cmp x30, #4095 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - subs x4, sp, #3822 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cmn w3, #291, lsl #12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cmn wsp, #1365 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cmn sp, #1092, lsl #12 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov sp, x30 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov wsp, w20 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - 
- - - - mov x11, sp +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov w24, wsp +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add w3, w5, w7 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add wzr, w3, w5 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add w20, wzr, w4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add w4, w6, wzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add w11, w13, w15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add w9, w3, wzr, lsl #10 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add w17, w29, w20, lsl #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add w21, w22, w23, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add w24, w25, w26, lsr #18 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add w27, w28, w29, lsr #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add w2, w3, w4, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add w5, w6, w7, asr #21 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add w8, w9, w10, asr #31 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add x3, x5, x7 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add xzr, x3, x5 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add x20, xzr, x4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add x4, x6, xzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add x11, x13, x15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add x9, x3, xzr, lsl #10 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add x17, x29, x20, lsl #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add x21, x22, x23, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add x24, x25, x26, lsr #18 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add x27, x28, x29, lsr #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add x2, x3, x4, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add x5, x6, x7, asr #21 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add x8, x9, x10, asr #63 +# CHECK-NEXT: 
0.25 0.25 0.25 0.25 - - - - - - - - adds w3, w5, w7 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmn w3, w5 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - adds w20, wzr, w4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - adds w4, w6, wzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - adds w11, w13, w15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds w9, w3, wzr, lsl #10 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds w17, w29, w20, lsl #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds w21, w22, w23, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds w24, w25, w26, lsr #18 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds w27, w28, w29, lsr #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds w2, w3, w4, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds w5, w6, w7, asr #21 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds w8, w9, w10, asr #31 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - adds x3, x5, x7 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmn x3, x5 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - adds x20, xzr, x4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - adds x4, x6, xzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - adds x11, x13, x15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds x9, x3, xzr, lsl #10 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds x17, x29, x20, lsl #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds x21, x22, x23, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds x24, x25, x26, lsr #18 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds x27, x28, x29, lsr #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds x2, x3, x4, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds x5, x6, x7, asr #21 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds x8, x9, x10, asr #63 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub w3, w5, w7 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 
- - - - - - - sub wzr, w3, w5 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub w4, w6, wzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub w11, w13, w15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub w9, w3, wzr, lsl #10 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub w17, w29, w20, lsl #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub w21, w22, w23, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub w24, w25, w26, lsr #18 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub w27, w28, w29, lsr #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub w2, w3, w4, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub w5, w6, w7, asr #21 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub w8, w9, w10, asr #31 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub x3, x5, x7 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub xzr, x3, x5 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub x4, x6, xzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub x11, x13, x15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub x9, x3, xzr, lsl #10 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub x17, x29, x20, lsl #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub x21, x22, x23, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub x24, x25, x26, lsr #18 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub x27, x28, x29, lsr #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub x2, x3, x4, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub x5, x6, x7, asr #21 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub x8, x9, x10, asr #63 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subs w3, w5, w7 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp w3, w5 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subs w4, w6, wzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subs w11, w13, w15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs w9, w3, wzr, lsl 
#10 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs w17, w29, w20, lsl #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs w21, w22, w23, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs w24, w25, w26, lsr #18 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs w27, w28, w29, lsr #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs w2, w3, w4, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs w5, w6, w7, asr #21 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs w8, w9, w10, asr #31 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subs x3, x5, x7 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp x3, x5 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subs x4, x6, xzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subs x11, x13, x15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs x9, x3, xzr, lsl #10 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs x17, x29, x20, lsl #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs x21, x22, x23, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs x24, x25, x26, lsr #18 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs x27, x28, x29, lsr #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs x2, x3, x4, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs x5, x6, x7, asr #21 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs x8, x9, x10, asr #63 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmn wzr, w4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmn w5, wzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmn w6, w7 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn w8, w9, lsl #15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn w10, w11, lsl #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn w12, w13, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn w14, w15, lsr #21 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn w16, w17, lsr #31 +# CHECK-NEXT: 
0.25 0.25 0.75 0.75 - - - - - - - - cmn w18, w19, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn w20, w21, asr #22 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn w22, w23, asr #31 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmn x0, x3 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmn xzr, x4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmn x5, xzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmn x6, x7 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn x8, x9, lsl #15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn x10, x11, lsl #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn x12, x13, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn x14, x15, lsr #41 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn x16, x17, lsr #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn x18, x19, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn x20, x21, asr #55 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn x22, x23, asr #63 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp w0, w3 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp wzr, w4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp w5, wzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp w6, w7 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp w8, w9, lsl #15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp w10, w11, lsl #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp w12, w13, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp w14, w15, lsr #21 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp w18, w19, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp w20, w21, asr #22 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp w22, w23, asr #31 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp x0, x3 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp xzr, x4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp x5, xzr +# CHECK-NEXT: 0.25 
0.25 0.25 0.25 - - - - - - - - cmp x6, x7 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp x8, x9, lsl #15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp x10, x11, lsl #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp x12, x13, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp x14, x15, lsr #41 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp x16, x17, lsr #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp x18, x19, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp x20, x21, asr #55 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp x22, x23, asr #63 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp wzr, w0 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp xzr, x0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adc w29, w27, w25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adc wzr, w3, w4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adc w9, wzr, w10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adc w20, w0, wzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adc x29, x27, x25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adc xzr, x3, x4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adc x9, xzr, x10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adc x20, x0, xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adcs w29, w27, w25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adcs wzr, w3, w4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adcs w9, wzr, w10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adcs w20, w0, wzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adcs x29, x27, x25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adcs xzr, x3, x4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adcs x9, xzr, x10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adcs x20, x0, xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbc w29, w27, w25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbc wzr, w3, w4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngc w9, w10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbc w20, w0, wzr +# 
CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbc x29, x27, x25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbc xzr, x3, x4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngc x9, x10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbc x20, x0, xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbcs w29, w27, w25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbcs wzr, w3, w4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngcs w9, w10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbcs w20, w0, wzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbcs x29, x27, x25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbcs xzr, x3, x4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngcs x9, x10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbcs x20, x0, xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngc w3, w12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngc wzr, w9 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngc w23, wzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngc x29, x30 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngc xzr, x0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngc x0, xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngcs w3, w12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngcs wzr, w9 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngcs w23, wzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngcs x29, x30 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngcs xzr, x0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngcs x0, xzr +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sbfx x1, x2, #3, #2 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr x3, x4, #63 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr wzr, wzr, #31 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sbfx w12, w9, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ubfiz x4, x5, #52, #11 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ubfx xzr, x4, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ubfiz x4, xzr, #1, #6 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr x5, x6, #12 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfi x4, 
x5, #52, #11 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil xzr, x4, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfc x4, #1, #6 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil x5, x6, #12, #52 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sxtb w1, w2 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sxtb xzr, w3 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sxth w9, w10 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sxth x0, w1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sxtw x3, w30 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - uxtb w1, w2 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - uxth w9, w10 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ubfx x3, x30, #0, #32 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr w3, w2, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr w9, w10, #31 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr x20, x21, #63 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr w1, wzr, #3 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr w3, w2, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr w9, w10, #31 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr x20, x21, #63 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr wzr, wzr, #3 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr w3, w2, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl w9, w10, #31 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl x20, x21, #63 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl w1, wzr, #3 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sbfx w9, w10, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sbfiz x2, x3, #63, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr x19, x20, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sbfiz x9, x10, #5, #59 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr w9, w10, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sbfiz w11, w12, #31, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sbfiz w13, w14, #29, #3 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sbfiz xzr, xzr, #10, #11 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sbfx 
w9, w10, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr x2, x3, #63 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr x19, x20, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr x9, x10, #5 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr w9, w10, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr w11, w12, #31 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr w13, w14, #29 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sbfx xzr, xzr, #10, #11 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil w9, w10, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfi x2, x3, #63, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil x19, x20, #0, #64 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfi x9, x10, #5, #59 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil w9, w10, #0, #32 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfi w11, w12, #31, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfi w13, w14, #29, #3 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfc xzr, #10, #11 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil w9, w10, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil x2, x3, #63, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil x19, x20, #0, #64 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil x9, x10, #5, #59 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil w9, w10, #0, #32 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil w11, w12, #31, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil w13, w14, #29, #3 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil xzr, xzr, #10, #11 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ubfx w9, w10, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl x2, x3, #63 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr x19, x20, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl x9, x10, #5 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr w9, w10, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl w11, w12, #31 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl w13, w14, #29 +# CHECK-NEXT: - - 0.50 
0.50 - - - - - - - - ubfiz xzr, xzr, #10, #11 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ubfx w9, w10, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr x2, x3, #63 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr x19, x20, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr x9, x10, #5 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr w9, w10, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr w11, w12, #31 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr w13, w14, #29 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ubfx xzr, xzr, #10, #11 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cbz w5, #4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cbz x5, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cbnz x2, #-4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cbnz x26, #1048572 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cbz wzr, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cbnz xzr, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - b.ne #4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - b.ge #1048572 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - b.ge #-4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp w1, #31, #0, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp w3, #0, #15, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp wzr, #15, #13, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp x9, #31, #0, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp x3, #0, #15, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp xzr, #5, #7, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn w1, #31, #0, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn w3, #0, #15, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn wzr, #15, #13, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn x9, #31, #0, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn x3, #0, #15, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn xzr, #5, #7, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp w1, wzr, #0, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp w3, w0, #15, hs +# 
CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp wzr, w15, #13, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp x9, xzr, #0, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp x3, x0, #15, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp xzr, x5, #7, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn w1, wzr, #0, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn w3, w0, #15, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn wzr, w15, #13, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn x9, xzr, #0, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn x3, x0, #15, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn xzr, x5, #7, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csel w1, w0, w19, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csel wzr, w5, w9, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csel w9, wzr, w30, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csel w1, w28, wzr, mi +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csel x19, x23, x29, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csel xzr, x3, x4, ge +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csel x5, xzr, x6, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csel x7, x8, xzr, lo +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc w1, w0, w19, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc wzr, w5, w9, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc w9, wzr, w30, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc w1, w28, wzr, mi +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc x19, x23, x29, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc xzr, x3, x4, ge +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc x5, xzr, x6, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc x7, x8, xzr, lo +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv w1, w0, w19, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv wzr, w5, w9, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv w9, wzr, w30, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv w1, w28, wzr, 
mi +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv x19, x23, x29, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv xzr, x3, x4, ge +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv x5, xzr, x6, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv x7, x8, xzr, lo +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csneg w1, w0, w19, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csneg wzr, w5, w9, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csneg w9, wzr, w30, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csneg w1, w28, wzr, mi +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csneg x19, x23, x29, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csneg xzr, x3, x4, ge +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csneg x5, xzr, x6, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csneg x7, x8, xzr, lo +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cset w3, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cset x9, pl +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csetm w20, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csetm x30, ge +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc w2, wzr, wzr, al +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv x3, xzr, xzr, nv +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cinc w3, w5, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cinc wzr, w4, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cset w9, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cinc x3, x5, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cinc xzr, x4, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cset x9, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc w5, w6, w6, nv +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc x1, x2, x2, al +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cinv w3, w5, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cinv wzr, w4, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csetm w9, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cinv x3, x5, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cinv xzr, x4, le +# CHECK-NEXT: 0.50 
0.50 - - - - - - - - - - csetm x9, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv x1, x0, x0, al +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv w9, w8, w8, nv +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cneg w3, w5, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cneg wzr, w4, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cneg w9, wzr, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cneg x3, x5, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cneg xzr, x4, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cneg x9, xzr, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csneg x4, x8, x8, al +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv w9, w8, w8, nv +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - rbit w0, w7 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - rbit x18, x3 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - rev16 w17, w1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - rev16 x5, x2 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - rev w18, w0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - rev32 x20, x1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - rev x22, x2 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - clz w24, w3 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - clz x26, x4 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - cls w3, w5 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - cls x20, x5 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 - - udiv w0, w7, w10 +# CHECK-NEXT: - - - - 2.00 - - - - 1.00 - - udiv x9, x22, x4 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 - - sdiv w12, w21, w0 +# CHECK-NEXT: - - - - 2.00 - - - - 1.00 - - sdiv x13, x2, x1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl w11, w12, w13 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl x14, x15, x16 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr w17, w18, w19 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr x20, x21, x22 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr w23, w24, w25 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr x26, x27, x28 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ror w0, w1, 
w2 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ror x3, x4, x5 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl w6, w7, w8 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl x9, x10, x11 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr w12, w13, w14 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr x15, x16, x17 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr w18, w19, w20 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr x21, x22, x23 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ror w24, w25, w26 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ror x27, x28, x29 +# CHECK-NEXT: - - - - 1.00 - - - - - - - smulh x30, x29, x28 +# CHECK-NEXT: - - - - 1.00 - - - - - - - smulh xzr, x27, x26 +# CHECK-NEXT: - - - - 1.00 - - - - - - - umulh x30, x29, x28 +# CHECK-NEXT: - - - - 1.00 - - - - - - - umulh x23, x30, xzr +# CHECK-NEXT: - - - - 1.00 - - - - - - - madd w1, w3, w7, w4 +# CHECK-NEXT: - - - - 1.00 - - - - - - - madd wzr, w0, w9, w11 +# CHECK-NEXT: - - - - 1.00 - - - - - - - madd w13, wzr, w4, w4 +# CHECK-NEXT: - - - - 1.00 - - - - - - - madd w19, w30, wzr, w29 +# CHECK-NEXT: - - - - 1.00 - - - - - - - mul w4, w5, w6 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - madd x1, x3, x7, x4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - madd xzr, x0, x9, x11 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - madd x13, xzr, x4, x4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - madd x19, x30, xzr, x29 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - mul x4, x5, x6 +# CHECK-NEXT: - - - - 1.00 - - - - - - - msub w1, w3, w7, w4 +# CHECK-NEXT: - - - - 1.00 - - - - - - - msub wzr, w0, w9, w11 +# CHECK-NEXT: - - - - 1.00 - - - - - - - msub w13, wzr, w4, w4 +# CHECK-NEXT: - - - - 1.00 - - - - - - - msub w19, w30, wzr, w29 +# CHECK-NEXT: - - - - 1.00 - - - - - - - mneg w4, w5, w6 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - msub x1, x3, x7, x4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - msub xzr, x0, x9, x11 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 
1.00 - - - - - - - msub x13, xzr, x4, x4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - msub x19, x30, xzr, x29 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - mneg x4, x5, x6 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smaddl x3, w5, w2, x9 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smaddl xzr, w10, w11, x12 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smaddl x13, wzr, w14, x15 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smaddl x16, w17, wzr, x18 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smull x19, w20, w21 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smsubl x3, w5, w2, x9 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smsubl xzr, w10, w11, x12 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smsubl x13, wzr, w14, x15 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smsubl x16, w17, wzr, x18 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smnegl x19, w20, w21 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - umaddl x3, w5, w2, x9 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - umaddl xzr, w10, w11, x12 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - umaddl x13, wzr, w14, x15 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - umaddl x16, w17, wzr, x18 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - umull x19, w20, w21 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - umsubl x3, w5, w2, x9 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - umsubl x16, w17, wzr, x18 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - umnegl x19, w20, w21 +# CHECK-NEXT: - - - - 1.00 - - - - - - - smulh x30, x29, x28 +# CHECK-NEXT: - - - - 1.00 - - - - - - - smulh x23, x22, xzr +# CHECK-NEXT: - - - - 1.00 - - - - - - - umulh x23, x22, xzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - mul x19, x20, xzr +# CHECK-NEXT: - - - - 1.00 - - - - - - - mneg w21, w22, w23 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smull x11, w13, w17 +# CHECK-NEXT: 
0.25 0.25 0.25 0.25 1.00 - - - - - - - umull x11, w13, w17 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smnegl x11, w13, w17 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - umnegl x11, w13, w17 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - extr w3, w5, w7, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - extr w11, w13, w17, #31 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - extr x3, x5, x7, #15 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - extr x11, x13, x17, #63 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ror x19, x23, #24 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ror x29, xzr, #63 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ror w9, w13, #31 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmp s3, s5 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmp s31, #0.0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmp s31, #0.0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmpe s29, s30 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmpe s15, #0.0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmpe s15, #0.0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmp d4, d12 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmp d23, #0.0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmp d23, #0.0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmpe d26, d22 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmpe d29, #0.0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmpe d29, #0.0 +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmp s1, s31, #0, eq +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmp s3, s0, #15, hs +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmp s31, s15, #13, hs +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmp d9, d31, #0, le +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmp d3, d0, #15, gt +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmp d31, d5, #7, ne +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmpe s1, s31, #0, eq +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmpe s3, s0, #15, hs +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmpe s31, 
s15, #13, hs +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmpe d9, d31, #0, le +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmpe d3, d0, #15, gt +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmpe d31, d5, #7, ne +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 0.50 0.50 - fcsel s3, s20, s9, pl +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 0.50 0.50 - fcsel d9, d10, d11, mi +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov s0, s1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fabs s2, s3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fneg s4, s5 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fsqrt s6, s7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvt d8, s9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvt h10, s11 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintn s12, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintp s14, s15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintm s16, s17 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintz s18, s19 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinta s20, s21 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintx s22, s23 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinti s24, s25 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov d0, d1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fabs d2, d3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fneg d4, d5 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fsqrt d6, d7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvt s8, d9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvt h10, d11 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintn d12, d13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintp d14, d15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintm d16, d17 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintz d18, d19 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinta d20, d21 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintx d22, d23 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinti d24, d25 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvt s26, h27 +# CHECK-NEXT: 
- - - - - - - - - 0.50 0.50 - fcvt d28, h29 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmul s20, s19, s17 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fdiv s1, s2, s3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fadd s4, s5, s6 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fsub s7, s8, s9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmax s10, s11, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmin s13, s14, s15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxnm s16, s17, s18 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fminnm s19, s20, s21 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fnmul s22, s23, s2 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmul d20, d19, d17 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fdiv d1, d2, d3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fadd d4, d5, d6 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fsub d7, d8, d9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmax d10, d11, d12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmin d13, d14, d15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxnm d16, d17, d18 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fminnm d19, d20, d21 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fnmul d22, d23, d24 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmadd s3, s5, s6, s31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmadd d3, d13, d0, d23 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmsub s3, s5, s6, s31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmsub d3, d13, d0, d23 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fnmadd s3, s5, s6, s31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fnmadd d3, d13, d0, d23 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fnmsub s3, s5, s6, s31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fnmsub d3, d13, d0, d23 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs w3, h5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs wzr, h20, #13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs w19, h0, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x3, h5, #1 +# 
CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x12, h30, #45 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x19, h0, #64 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs w3, s5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs wzr, s20, #13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs w19, s0, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x3, s5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x12, s30, #45 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x19, s0, #64 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs w3, d5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs wzr, d20, #13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs w19, d0, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x3, d5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x12, d30, #45 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x19, d0, #64 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu w3, h5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu wzr, h20, #13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu w19, h0, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x3, h5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x12, h30, #45 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x19, h0, #64 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu w3, s5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu wzr, s20, #13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu w19, s0, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x3, s5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x12, s30, #45 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x19, s0, #64 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu w3, d5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu wzr, d20, #13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu w19, d0, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 
0.50 1.00 fcvtzu x3, d5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x12, d30, #45 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x19, d0, #64 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf h23, w19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf h31, wzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf h14, w0, #32 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf h23, x19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf h31, xzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf h14, x0, #64 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf s23, w19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf s31, wzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf s14, w0, #32 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf s23, x19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf s31, xzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf s14, x0, #64 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf d23, w19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf d31, wzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf d14, w0, #32 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf d23, x19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf d31, xzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf d14, x0, #64 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf h23, w19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf h31, wzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf h14, w0, #32 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf h23, x19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf h31, xzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf h14, x0, #64 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf s23, w19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf s31, wzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf s14, w0, #32 +# CHECK-NEXT: - 
- - - 1.00 - - - - 1.00 1.00 - ucvtf s23, x19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf s31, xzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf s14, x0, #64 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf d23, w19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf d31, wzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf d14, w0, #32 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf d23, x19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf d31, xzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf d14, x0, #64 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtns w3, h31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtns xzr, h12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtnu wzr, h12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtnu x0, h0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtps wzr, h9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtps x12, h20 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtpu w30, h23 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtpu x29, h3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtms w2, h3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtms x4, h5 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtmu w6, h7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtmu x8, h9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs w10, h11 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x12, h13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu w14, h15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x15, h16 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf h17, w18 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf h19, x20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf h21, w22 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf h23, x24 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtas w25, h26 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtas x27, h28 +# CHECK-NEXT: - - 
- - - - - - - 0.50 0.50 1.00 fcvtau w29, h30 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtau xzr, h0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtns w3, s31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtns xzr, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtnu wzr, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtnu x0, s0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtps wzr, s9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtps x12, s20 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtpu w30, s23 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtpu x29, s3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtms w2, s3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtms x4, s5 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtmu w6, s7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtmu x8, s9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs w10, s11 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x12, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu w14, s15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x15, s16 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf s17, w18 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf s19, x20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf s21, w22 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf s23, x24 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtas w25, s26 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtas x27, s28 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtau w29, s30 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtau xzr, s0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtns w3, d31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtns xzr, d12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtnu wzr, d12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtnu x0, d0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtps wzr, d9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtps 
x12, d20 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtpu w30, d23 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtpu x29, d3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtms w2, d3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtms x4, d5 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtmu w6, d7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtmu x8, d9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs w10, d11 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x12, d13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu w14, d15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x15, d16 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf d17, w18 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf d19, x20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf d21, w22 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf d23, x24 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtas w25, d26 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtas x27, d28 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtau w29, d30 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtau xzr, d0 +# CHECK-NEXT: - - - - 1.00 - - - - - - - fmov w3, s9 +# CHECK-NEXT: - - - - - - - - - - - 1.00 fmov s9, w3 +# CHECK-NEXT: - - - - 1.00 - - - - - - - fmov x20, d31 +# CHECK-NEXT: - - - - - - - - - - - 1.00 fmov d1, x15 +# CHECK-NEXT: - - - - 1.00 - - - - 0.50 0.50 - fmov x3, v12.d[1] +# CHECK-NEXT: - - - - - - - - - 1.00 - - fmov v1.d[1], x19 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov s2, #0.12500000 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov s3, #1.00000000 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov d30, #16.00000000 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov s4, #1.06250000 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov d10, #1.93750000 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov s12, #-1.00000000 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov d16, #8.50000000 +# CHECK-NEXT: - - - - - 0.50 0.50 - - 
- - - ldr w3, #0 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x29, #4 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsw xzr, #-4 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr s0, #8 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr d0, #1048572 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr q0, #-1048576 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - prfm pldl1strm, #0 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - prfm #22, #0 +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stxrb w18, w8, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stxrh w24, w15, [x16] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stxr w5, w6, [x17] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stxr w1, x10, [x21] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldxrb w30, [x0] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldxrh w17, [x4] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldxr w22, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldxr x11, [x29] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldxr x11, [x29] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldxr x11, [x29] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stxp w12, w11, w10, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stxp wzr, x27, x9, [x12] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldxp w0, wzr, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldxp x17, x0, [x18] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldxp x17, x0, [x18] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stlxrb w12, w22, [x0] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stlxrh w10, w1, [x1] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stlxr w9, w2, [x2] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stlxr w9, x3, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldaxrb w8, [x4] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldaxrh w7, [x5] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldaxr w6, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldaxr x5, [x6] +# CHECK-NEXT: - - - - - 0.50 0.50 - - 
- - - ldaxr x5, [x6] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldaxr x5, [x6] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stlxp w4, w5, w6, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stlxp wzr, x6, x7, [x1] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldaxp w5, w18, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldaxp x6, x19, [x22] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldaxp x6, x19, [x22] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stlrb w24, [sp] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stlrh w25, [x30] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stlr w26, [x29] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stlr x27, [x28] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stlr x27, [x28] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stlr x27, [x28] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldarb w23, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldarh w22, [x30] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldar wzr, [x29] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldar x21, [x28] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldar x21, [x28] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldar x21, [x28] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - sturb w9, [sp] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - sturh wzr, [x12, #255] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stur w16, [x0, #-256] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stur x28, [x14, #1] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldurb w1, [x20, #255] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldurh w20, [x1, #255] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldur w12, [sp, #255] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldur xzr, [x12, #255] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldursb x9, [x7, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldursh x17, [x19, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldursw x20, [x15, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - prfum pldl2keep, [sp, #-256] +# CHECK-NEXT: - - - - - 0.50 
0.50 - - - - - ldursb w19, [x1, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldursh w15, [x21, #-256] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 stur b0, [sp, #1] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 stur h12, [x12, #-1] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 stur s15, [x0, #255] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 stur d31, [x5, #25] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 stur q9, [x5] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldur b3, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldur h5, [x4, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldur s7, [x12, #-1] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldur d11, [x19, #4] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldur q13, [x1, #2] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strb w9, [x2], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strb w10, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strb w10, [x3], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strh w9, [x2], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strh w9, [x2], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strh w10, [x3], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str w19, [sp], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str w20, [x30], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str w21, [x12], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str xzr, [x9], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str x2, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str x19, [x12], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrb w9, [x2], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrb w10, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrb w10, [x3], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrh w9, [x2], #255 +# 
CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrh w9, [x2], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrh w10, [x3], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr w19, [sp], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr w20, [x30], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr w21, [x12], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr xzr, [x9], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr x2, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr x19, [x12], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb xzr, [x9], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb x2, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb x19, [x12], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh xzr, [x9], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh x2, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh x19, [x12], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsw xzr, [x9], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsw x2, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsw x19, [x12], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb wzr, [x9], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb w2, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb w19, [x12], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh wzr, [x9], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh w2, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh w19, [x12], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str b0, [x0], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str b3, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - 
str b5, [sp], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str h10, [x10], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str h13, [x23], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str h15, [sp], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str s20, [x20], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str s23, [x23], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str s25, [x0], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str d20, [x20], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str d23, [x23], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str d25, [x0], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr b0, [x0], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr b3, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr b5, [sp], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr h10, [x10], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr h13, [x23], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr h15, [sp], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr s20, [x20], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr s23, [x23], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr s25, [x0], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr d20, [x20], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr d23, [x23], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr d25, [x0], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr q20, [x1], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr q23, [x9], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr q25, [x20], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str q10, [x1], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - 
- - str q22, [sp], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str q21, [x20], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr x3, [x4, #0]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strb w9, [x2, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strb w10, [x3, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strb w10, [x3, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strh w9, [x2, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strh w9, [x2, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strh w10, [x3, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str w19, [sp, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str w20, [x30, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str w21, [x12, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str xzr, [x9, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str x2, [x3, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str x19, [x12, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrb w9, [x2, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrb w10, [x3, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrb w10, [x3, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrh w9, [x2, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrh w9, [x2, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrh w10, [x3, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr w19, [sp, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr w20, [x30, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr w21, [x12, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr xzr, [x9, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr x2, [x3, #1]! 
+# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr x19, [x12, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb xzr, [x9, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb x2, [x3, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb x19, [x12, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh xzr, [x9, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh x2, [x3, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh x19, [x12, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsw xzr, [x9, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsw x2, [x3, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsw x19, [x12, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb wzr, [x9, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb w2, [x3, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb w19, [x12, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh wzr, [x9, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh w2, [x3, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh w19, [x12, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str b0, [x0, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str b3, [x3, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str b5, [sp, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str h10, [x10, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str h13, [x23, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str h15, [sp, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str s20, [x20, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str s23, [x23, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str s25, [x0, #-256]! 
+# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str d20, [x20, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str d23, [x23, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str d25, [x0, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr b0, [x0, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr b3, [x3, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr b5, [sp, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr h10, [x10, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr h13, [x23, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr h15, [sp, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr s20, [x20, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr s23, [x23, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr s25, [x0, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr d20, [x20, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr d23, [x23, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr d25, [x0, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr q20, [x1, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr q23, [x9, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr q25, [x20, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str q10, [x1, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str q22, [sp, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str q21, [x20, #-256]! 
+# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - sttrb w9, [sp] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - sttrh wzr, [x12, #255] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - sttr w16, [x0, #-256] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - sttr x28, [x14, #1] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldtrb w1, [x20, #255] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldtrh w20, [x1, #255] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldtr w12, [sp, #255] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldtr xzr, [x12, #255] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldtrsb x9, [x7, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldtrsh x17, [x19, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldtrsw x20, [x15, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldtrsb w19, [x1, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldtrsh w15, [x21, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x4, [x29] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x30, [x12, #32760] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x20, [sp, #8] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr xzr, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr w2, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr w17, [sp, #16380] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr w13, [x2, #4] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsw x2, [x5, #4] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsw x23, [sp, #16380] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrh w2, [x4] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsh w23, [x6, #8190] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsh wzr, [sp, #2] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsh x29, [x2, #2] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrb w26, [x3, #121] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrb w12, [x2] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsb w27, [sp, #4095] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsb xzr, [x15] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 
- str x30, [sp] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - str w20, [x4, #16380] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - strh w17, [sp, #8190] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - strb w23, [x3, #4095] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - strb wzr, [x2] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr b31, [sp, #4095] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr h20, [x2, #8190] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr s10, [x19, #16380] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr d3, [x10, #32760] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 str q12, [sp, #65520] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - ldrb w3, [sp, x5] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - ldrb w9, [x27, x6] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - ldrsb w10, [x30, x7] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - ldrb w11, [x29, x3, sxtx] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - strb w12, [x28, xzr, sxtx] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - ldrb w14, [x26, w6, uxtw] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - ldrsb w15, [x25, w7, uxtw] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - ldrb w17, [x23, w9, sxtw] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - ldrsb x18, [x22, w10, sxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsh w3, [sp, x5] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsh w9, [x27, x6] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrh w10, [x30, x7, lsl #1] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - strh w11, [x29, x3, sxtx] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrh w12, [x28, xzr, sxtx] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsh x13, [x27, x5, sxtx #1] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrh w14, [x26, w6, uxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrh w15, [x25, w7, uxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsh w16, [x24, w8, uxtw #1] +# CHECK-NEXT: - - - - - 0.50 
0.50 - - - - - ldrh w17, [x23, w9, sxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrh w18, [x22, w10, sxtw] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - strh w19, [x21, wzr, sxtw #1] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr w3, [sp, x5] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr s9, [x27, x6] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr w10, [x30, x7, lsl #2] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr w11, [x29, x3, sxtx] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 str s12, [x28, xzr, sxtx] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - str w13, [x27, x5, sxtx #2] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - str w14, [x26, w6, uxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr w15, [x25, w7, uxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr w16, [x24, w8, uxtw #2] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - ldrsw x17, [x23, w9, sxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr w18, [x22, w10, sxtw] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - ldrsw x19, [x21, wzr, sxtw #2] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x3, [sp, x5] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - str x9, [x27, x6] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr d10, [x30, x7, lsl #3] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - str x11, [x29, x3, sxtx] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x12, [x28, xzr, sxtx] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x13, [x27, x5, sxtx #3] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - prfm pldl1keep, [x26, w6, uxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x15, [x25, w7, uxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x16, [x24, w8, uxtw #3] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x17, [x23, w9, sxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x18, [x22, w10, sxtw] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 str d19, [x21, wzr, sxtw #3] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr q3, [sp, x5] +# CHECK-NEXT: - - - - - 0.50 
0.50 - - - - - ldr q9, [x27, x6] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr q10, [x30, x7, lsl #4] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 str q11, [x29, x3, sxtx] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 str q12, [x28, xzr, sxtx] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 str q13, [x27, x5, sxtx #4] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr q14, [x26, w6, uxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr q15, [x25, w7, uxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr q16, [x24, w8, uxtw #4] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr q17, [x23, w9, sxtw] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 str q18, [x22, w10, sxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr q19, [x21, wzr, sxtw #4] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp w3, w5, [sp] +# CHECK-NEXT: - - 0.50 0.50 - - - 0.50 0.50 - - - stp wzr, w9, [sp, #252] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp w2, wzr, [sp, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp w9, w10, [sp, #4] +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldpsw x9, x10, [sp, #4] +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldpsw x9, x10, [x2, #-256] +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldpsw x20, x30, [sp, #252] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp x21, x29, [x2, #504] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp x22, x23, [x3, #-512] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp x24, x25, [x4, #8] +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldp s29, s28, [sp, #252] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp s27, s26, [sp, #-256] +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldp s1, s2, [x3, #44] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp d3, d5, [x9, #504] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp d7, d11, [x10, #-512] +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldp d2, d3, [x30, #-8] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp q3, q5, [sp] +# CHECK-NEXT: - - - - - - - 1.00 
1.00 - - 2.00 stp q17, q19, [sp, #1008] +# CHECK-NEXT: - - - - - 1.00 1.00 - - - - - ldp q23, q29, [x1, #-1024] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp w3, w5, [sp], #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - 0.50 0.50 - - - stp wzr, w9, [sp], #252 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp w2, wzr, [sp], #-256 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp w9, w10, [sp], #4 +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldpsw x9, x10, [sp], #4 +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldpsw x9, x10, [x2], #-256 +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldpsw x20, x30, [sp], #252 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp x21, x29, [x2], #504 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp x22, x23, [x3], #-512 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp x24, x25, [x4], #8 +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldp s29, s28, [sp], #252 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp s27, s26, [sp], #-256 +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldp s1, s2, [x3], #44 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp d3, d5, [x9], #504 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp d7, d11, [x10], #-512 +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldp d2, d3, [x30], #-8 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp q3, q5, [sp], #0 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp q17, q19, [sp], #1008 +# CHECK-NEXT: - - - - - 1.00 1.00 - - - - - ldp q23, q29, [x1], #-1024 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp w3, w5, [sp, #0]! +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - 0.50 0.50 - - - stp wzr, w9, [sp, #252]! +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp w2, wzr, [sp, #-256]! +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp w9, w10, [sp, #4]! +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldpsw x9, x10, [sp, #4]! +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldpsw x9, x10, [x2, #-256]! +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldpsw x20, x30, [sp, #252]! 
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp x21, x29, [x2, #504]! +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp x22, x23, [x3, #-512]! +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp x24, x25, [x4, #8]! +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldp s29, s28, [sp, #252]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp s27, s26, [sp, #-256]! +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldp s1, s2, [x3, #44]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp d3, d5, [x9, #504]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp d7, d11, [x10, #-512]! +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldp d2, d3, [x30, #-8]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp q3, q5, [sp, #0]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp q17, q19, [sp, #1008]! +# CHECK-NEXT: - - - - - 1.00 1.00 - - - - - ldp q23, q29, [x1, #-1024]! +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldnp w3, w5, [sp] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stnp wzr, w9, [sp, #252] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldnp w2, wzr, [sp, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldnp w9, w10, [sp, #4] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldnp x21, x29, [x2, #504] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldnp x22, x23, [x3, #-512] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldnp x24, x25, [x4, #8] +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldnp s29, s28, [sp, #252] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stnp s27, s26, [sp, #-256] +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldnp s1, s2, [x3, #44] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stnp d3, d5, [x9, #504] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stnp d7, d11, [x10, #-512] +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldnp d2, d3, [x30, #-8] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stnp q3, q5, [sp] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stnp q17, q19, [sp, #1008] +# CHECK-NEXT: - - - - - 1.00 1.00 - - - - - ldnp q23, q29, [x1, 
#-1024] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov w3, #983055 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov x10, #-6148914691236517206 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - and w12, w23, w21 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - and w16, w15, w1, lsl #1 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - and w9, w4, w10, lsl #31 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - and w3, w30, w11 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - and x3, x5, x7, lsl #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - and x5, x14, x19, asr #4 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - and w3, w17, w19, ror #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - and w0, w2, wzr, lsr #17 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - and w3, w30, w11, asr #2 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - and xzr, x4, x26 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - and w3, wzr, w20, ror #2 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - and x7, x20, xzr, asr #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - bic x13, x20, x14, lsl #47 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - bic w2, w7, w9 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - orr w2, w7, w0, asr #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - orr x8, x9, x10, lsl #12 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - orn x3, x5, x7, asr #2 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - orn w2, w5, w29 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - ands w7, wzr, w9, lsl #1 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - ands x3, x5, x20, ror #63 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - bics w3, w5, w7 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - bics x3, xzr, x3, lsl #1 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - tst w3, w7, lsl #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - tst x2, x20, asr #2 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov x3, x6 +# CHECK-NEXT: 
0.25 0.25 0.25 0.25 - - - - - - - - mov x3, xzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov wzr, w2 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov w3, w5 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - movz w2, #0, lsl #16 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov w2, #-1235 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov x2, #5299989643264 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov x2, #0 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - movk w3, #0 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - movz x4, #0, lsl #16 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - movk w5, #0, lsl #16 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - movz x6, #0, lsl #32 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - movk x7, #0, lsl #32 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - movz x8, #0, lsl #48 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - movk x9, #0, lsl #48 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adr x2, #1600 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adrp x21, #6553600 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adr x0, #262144 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - tbz x12, #62, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - tbz x12, #62, #4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - tbz x12, #62, #-32768 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - tbnz x12, #60, #32764 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - b #4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - b #-4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - b #134217724 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - br x20 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - blr xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ret x10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ret +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - eret +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - drps diff --git a/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/cssc-instructions.s 
b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/cssc-instructions.s new file mode 100644 index 00000000000000..a19a106f4b47ec --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/cssc-instructions.s @@ -0,0 +1,76 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=ampere1b -instruction-tables < %s | FileCheck %s + +abs w1, w2 +abs x2, x3 +cnt w3, w4 +cnt x4, x5 +ctz w5, w6 +ctz x6, x7 +smax w7, w8, w9 +smax x8, x9, x10 +umax w9, w10, w11 +umax x10, x11, x12 +smin w11, w12, w13 +smin w12, w13, w14 +umin w13, w14, w15 +umin x14, x15, x16 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.25 abs w1, w2 +# CHECK-NEXT: 1 1 0.25 abs x2, x3 +# CHECK-NEXT: 1 3 1.00 cnt w3, w4 +# CHECK-NEXT: 1 3 1.00 cnt x4, x5 +# CHECK-NEXT: 1 1 0.50 ctz w5, w6 +# CHECK-NEXT: 1 1 0.50 ctz x6, x7 +# CHECK-NEXT: 2 1 0.50 smax w7, w8, w9 +# CHECK-NEXT: 2 1 0.50 smax x8, x9, x10 +# CHECK-NEXT: 2 1 0.50 umax w9, w10, w11 +# CHECK-NEXT: 2 1 0.50 umax x10, x11, x12 +# CHECK-NEXT: 2 1 0.50 smin w11, w12, w13 +# CHECK-NEXT: 2 1 0.50 smin w12, w13, w14 +# CHECK-NEXT: 2 1 0.50 umin w13, w14, w15 +# CHECK-NEXT: 2 1 0.50 umin x14, x15, x16 + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - Ampere1BUnitA +# CHECK-NEXT: [0.1] - Ampere1BUnitA +# CHECK-NEXT: [1.0] - Ampere1BUnitB +# CHECK-NEXT: [1.1] - Ampere1BUnitB +# CHECK-NEXT: [2] - Ampere1BUnitBS +# CHECK-NEXT: [3.0] - Ampere1BUnitL +# CHECK-NEXT: [3.1] - Ampere1BUnitL +# CHECK-NEXT: [4.0] - Ampere1BUnitS +# CHECK-NEXT: [4.1] - Ampere1BUnitS +# CHECK-NEXT: [5] - Ampere1BUnitX +# CHECK-NEXT: [6] - Ampere1BUnitY +# CHECK-NEXT: [7] - Ampere1BUnitZ + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] 
[3.1] [4.0] [4.1] [5] [6] [7] +# CHECK-NEXT: 6.50 6.50 3.50 3.50 2.00 - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4.0] [4.1] [5] [6] [7] Instructions: +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - abs w1, w2 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - abs x2, x3 +# CHECK-NEXT: - - - - 1.00 - - - - - - - cnt w3, w4 +# CHECK-NEXT: - - - - 1.00 - - - - - - - cnt x4, x5 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ctz w5, w6 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ctz x6, x7 +# CHECK-NEXT: 0.75 0.75 0.25 0.25 - - - - - - - - smax w7, w8, w9 +# CHECK-NEXT: 0.75 0.75 0.25 0.25 - - - - - - - - smax x8, x9, x10 +# CHECK-NEXT: 0.75 0.75 0.25 0.25 - - - - - - - - umax w9, w10, w11 +# CHECK-NEXT: 0.75 0.75 0.25 0.25 - - - - - - - - umax x10, x11, x12 +# CHECK-NEXT: 0.75 0.75 0.25 0.25 - - - - - - - - smin w11, w12, w13 +# CHECK-NEXT: 0.75 0.75 0.25 0.25 - - - - - - - - smin w12, w13, w14 +# CHECK-NEXT: 0.75 0.75 0.25 0.25 - - - - - - - - umin w13, w14, w15 +# CHECK-NEXT: 0.75 0.75 0.25 0.25 - - - - - - - - umin x14, x15, x16 diff --git a/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/mte-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/mte-instructions.s new file mode 100644 index 00000000000000..5148522431edbf --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/mte-instructions.s @@ -0,0 +1,349 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=ampere1b -instruction-tables < %s | FileCheck %s + +irg x0, x1 +irg sp, x1 +irg x0, sp +irg x0, x1, x2 +irg sp, x1, x2 +addg x0, x1, #0, #1 +addg sp, x2, #32, #3 +addg x0, sp, #64, #5 +addg x3, x4, #1008, #6 +addg x5, x6, #112, #15 +subg x0, x1, #0, #1 +subg sp, x2, #32, #3 +subg x0, sp, #64, #5 +subg x3, x4, #1008, #6 +subg x5, x6, #112, #15 +gmi x0, x1, x2 +gmi x3, sp, x4 +gmi xzr, x0, x30 +gmi x30, x0, xzr +subp x0, x1, x2 +subps x0, x1, 
x2 +subp x0, sp, sp +subps x0, sp, sp +subps xzr, x0, x1 +subps xzr, sp, sp +stg x0, [x1, #-4096] +stg x1, [x2, #4080] +stg x2, [sp, #16] +stg x3, [x1] +stg sp, [x1] +stzg x0, [x1, #-4096] +stzg x1, [x2, #4080] +stzg x2, [sp, #16] +stzg x3, [x1] +stzg sp, [x1] +stg x0, [x1, #-4096]! +stg x1, [x2, #4080]! +stg x2, [sp, #16]! +stg sp, [sp, #16]! +stzg x0, [x1, #-4096]! +stzg x1, [x2, #4080]! +stzg x2, [sp, #16]! +stzg sp, [sp, #16]! +stg x0, [x1], #-4096 +stg x1, [x2], #4080 +stg x2, [sp], #16 +stg sp, [sp], #16 +stzg x0, [x1], #-4096 +stzg x1, [x2], #4080 +stzg x2, [sp], #16 +stzg sp, [sp], #16 +st2g x0, [x1, #-4096] +st2g x1, [x2, #4080] +st2g x2, [sp, #16] +st2g x3, [x1] +st2g sp, [x1] +stz2g x0, [x1, #-4096] +stz2g x1, [x2, #4080] +stz2g x2, [sp, #16] +stz2g x3, [x1] +stz2g sp, [x1] +st2g x0, [x1, #-4096]! +st2g x1, [x2, #4080]! +st2g x2, [sp, #16]! +st2g sp, [sp, #16]! +stz2g x0, [x1, #-4096]! +stz2g x1, [x2, #4080]! +stz2g x2, [sp, #16]! +stz2g sp, [sp, #16]! +st2g x0, [x1], #-4096 +st2g x1, [x2], #4080 +st2g x2, [sp], #16 +st2g sp, [sp], #16 +stz2g x0, [x1], #-4096 +stz2g x1, [x2], #4080 +stz2g x2, [sp], #16 +stz2g sp, [sp], #16 +stgp x0, x1, [x2, #-1024] +stgp x0, x1, [x2, #1008] +stgp x0, x1, [sp, #16] +stgp xzr, x1, [x2, #16] +stgp x0, xzr, [x2, #16] +stgp x0, xzr, [x2] +stgp x0, x1, [x2, #-1024]! +stgp x0, x1, [x2, #1008]! +stgp x0, x1, [sp, #16]! +stgp xzr, x1, [x2, #16]! +stgp x0, xzr, [x2, #16]! 
+stgp x0, x1, [x2], #-1024 +stgp x0, x1, [x2], #1008 +stgp x0, x1, [sp], #16 +stgp xzr, x1, [x2], #16 +stgp x0, xzr, [x2], #16 +ldg x0, [x1] +ldg x2, [sp, #-4096] +ldg x3, [x4, #4080] +ldgm x0, [x1] +ldgm x1, [sp] +ldgm xzr, [x2] +stgm x0, [x1] +stgm x1, [sp] +stgm xzr, [x2] +stzgm x0, [x1] +stzgm x1, [sp] +stzgm xzr, [x2] + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 2 1 1.00 U irg x0, x1 +# CHECK-NEXT: 2 1 1.00 U irg sp, x1 +# CHECK-NEXT: 2 1 1.00 U irg x0, sp +# CHECK-NEXT: 2 1 1.00 U irg x0, x1, x2 +# CHECK-NEXT: 2 1 1.00 U irg sp, x1, x2 +# CHECK-NEXT: 1 1 0.50 addg x0, x1, #0, #1 +# CHECK-NEXT: 1 1 0.50 addg sp, x2, #32, #3 +# CHECK-NEXT: 1 1 0.50 addg x0, sp, #64, #5 +# CHECK-NEXT: 1 1 0.50 addg x3, x4, #1008, #6 +# CHECK-NEXT: 1 1 0.50 addg x5, x6, #112, #15 +# CHECK-NEXT: 1 1 0.50 U subg x0, x1, #0, #1 +# CHECK-NEXT: 1 1 0.50 U subg sp, x2, #32, #3 +# CHECK-NEXT: 1 1 0.50 U subg x0, sp, #64, #5 +# CHECK-NEXT: 1 1 0.50 U subg x3, x4, #1008, #6 +# CHECK-NEXT: 1 1 0.50 U subg x5, x6, #112, #15 +# CHECK-NEXT: 1 1 0.25 gmi x0, x1, x2 +# CHECK-NEXT: 1 1 0.25 gmi x3, sp, x4 +# CHECK-NEXT: 1 1 0.25 gmi xzr, x0, x30 +# CHECK-NEXT: 1 1 0.25 gmi x30, x0, xzr +# CHECK-NEXT: 1 1 0.25 subp x0, x1, x2 +# CHECK-NEXT: 1 1 0.25 U subps x0, x1, x2 +# CHECK-NEXT: 1 1 0.25 subp x0, sp, sp +# CHECK-NEXT: 1 1 0.25 U subps x0, sp, sp +# CHECK-NEXT: 1 1 0.25 U subps xzr, x0, x1 +# CHECK-NEXT: 1 1 0.25 U subps xzr, sp, sp +# CHECK-NEXT: 1 1 0.50 * stg x0, [x1, #-4096] +# CHECK-NEXT: 1 1 0.50 * stg x1, [x2, #4080] +# CHECK-NEXT: 1 1 0.50 * stg x2, [sp, #16] +# CHECK-NEXT: 1 1 0.50 * stg x3, [x1] +# CHECK-NEXT: 1 1 0.50 * stg sp, [x1] +# CHECK-NEXT: 1 1 0.50 * stzg x0, [x1, #-4096] +# CHECK-NEXT: 1 1 0.50 * stzg x1, [x2, #4080] +# CHECK-NEXT: 1 1 0.50 * 
stzg x2, [sp, #16] +# CHECK-NEXT: 1 1 0.50 * stzg x3, [x1] +# CHECK-NEXT: 1 1 0.50 * stzg sp, [x1] +# CHECK-NEXT: 1 1 0.50 * U stg x0, [x1, #-4096]! +# CHECK-NEXT: 1 1 0.50 * U stg x1, [x2, #4080]! +# CHECK-NEXT: 1 1 0.50 * U stg x2, [sp, #16]! +# CHECK-NEXT: 1 1 0.50 * U stg sp, [sp, #16]! +# CHECK-NEXT: 1 1 0.50 * U stzg x0, [x1, #-4096]! +# CHECK-NEXT: 1 1 0.50 * U stzg x1, [x2, #4080]! +# CHECK-NEXT: 1 1 0.50 * U stzg x2, [sp, #16]! +# CHECK-NEXT: 1 1 0.50 * U stzg sp, [sp, #16]! +# CHECK-NEXT: 1 1 0.50 * U stg x0, [x1], #-4096 +# CHECK-NEXT: 1 1 0.50 * U stg x1, [x2], #4080 +# CHECK-NEXT: 1 1 0.50 * U stg x2, [sp], #16 +# CHECK-NEXT: 1 1 0.50 * U stg sp, [sp], #16 +# CHECK-NEXT: 1 1 0.50 * U stzg x0, [x1], #-4096 +# CHECK-NEXT: 1 1 0.50 * U stzg x1, [x2], #4080 +# CHECK-NEXT: 1 1 0.50 * U stzg x2, [sp], #16 +# CHECK-NEXT: 1 1 0.50 * U stzg sp, [sp], #16 +# CHECK-NEXT: 2 1 1.00 * st2g x0, [x1, #-4096] +# CHECK-NEXT: 2 1 1.00 * st2g x1, [x2, #4080] +# CHECK-NEXT: 2 1 1.00 * st2g x2, [sp, #16] +# CHECK-NEXT: 2 1 1.00 * st2g x3, [x1] +# CHECK-NEXT: 2 1 1.00 * st2g sp, [x1] +# CHECK-NEXT: 2 1 1.00 * stz2g x0, [x1, #-4096] +# CHECK-NEXT: 2 1 1.00 * stz2g x1, [x2, #4080] +# CHECK-NEXT: 2 1 1.00 * stz2g x2, [sp, #16] +# CHECK-NEXT: 2 1 1.00 * stz2g x3, [x1] +# CHECK-NEXT: 2 1 1.00 * stz2g sp, [x1] +# CHECK-NEXT: 2 1 1.00 * U st2g x0, [x1, #-4096]! +# CHECK-NEXT: 2 1 1.00 * U st2g x1, [x2, #4080]! +# CHECK-NEXT: 2 1 1.00 * U st2g x2, [sp, #16]! +# CHECK-NEXT: 2 1 1.00 * U st2g sp, [sp, #16]! +# CHECK-NEXT: 2 1 1.00 * U stz2g x0, [x1, #-4096]! +# CHECK-NEXT: 2 1 1.00 * U stz2g x1, [x2, #4080]! +# CHECK-NEXT: 2 1 1.00 * U stz2g x2, [sp, #16]! +# CHECK-NEXT: 2 1 1.00 * U stz2g sp, [sp, #16]! 
+# CHECK-NEXT: 2 1 1.00 * U st2g x0, [x1], #-4096 +# CHECK-NEXT: 2 1 1.00 * U st2g x1, [x2], #4080 +# CHECK-NEXT: 2 1 1.00 * U st2g x2, [sp], #16 +# CHECK-NEXT: 2 1 1.00 * U st2g sp, [sp], #16 +# CHECK-NEXT: 2 1 1.00 * U stz2g x0, [x1], #-4096 +# CHECK-NEXT: 2 1 1.00 * U stz2g x1, [x2], #4080 +# CHECK-NEXT: 2 1 1.00 * U stz2g x2, [sp], #16 +# CHECK-NEXT: 2 1 1.00 * U stz2g sp, [sp], #16 +# CHECK-NEXT: 2 1 1.00 * stgp x0, x1, [x2, #-1024] +# CHECK-NEXT: 2 1 1.00 * stgp x0, x1, [x2, #1008] +# CHECK-NEXT: 2 1 1.00 * stgp x0, x1, [sp, #16] +# CHECK-NEXT: 2 1 1.00 * stgp xzr, x1, [x2, #16] +# CHECK-NEXT: 2 1 1.00 * stgp x0, xzr, [x2, #16] +# CHECK-NEXT: 2 1 1.00 * stgp x0, xzr, [x2] +# CHECK-NEXT: 2 1 1.00 * stgp x0, x1, [x2, #-1024]! +# CHECK-NEXT: 2 1 1.00 * stgp x0, x1, [x2, #1008]! +# CHECK-NEXT: 2 1 1.00 * stgp x0, x1, [sp, #16]! +# CHECK-NEXT: 2 1 1.00 * stgp xzr, x1, [x2, #16]! +# CHECK-NEXT: 2 1 1.00 * stgp x0, xzr, [x2, #16]! +# CHECK-NEXT: 2 1 1.00 * stgp x0, x1, [x2], #-1024 +# CHECK-NEXT: 2 1 1.00 * stgp x0, x1, [x2], #1008 +# CHECK-NEXT: 2 1 1.00 * stgp x0, x1, [sp], #16 +# CHECK-NEXT: 2 1 1.00 * stgp xzr, x1, [x2], #16 +# CHECK-NEXT: 2 1 1.00 * stgp x0, xzr, [x2], #16 +# CHECK-NEXT: 2 4 0.50 * ldg x0, [x1] +# CHECK-NEXT: 2 4 0.50 * ldg x2, [sp, #-4096] +# CHECK-NEXT: 2 4 0.50 * ldg x3, [x4, #4080] +# CHECK-NEXT: 2 4 0.50 * U ldgm x0, [x1] +# CHECK-NEXT: 2 4 0.50 * U ldgm x1, [sp] +# CHECK-NEXT: 2 4 0.50 * U ldgm xzr, [x2] +# CHECK-NEXT: 1 1 0.50 U stgm x0, [x1] +# CHECK-NEXT: 1 1 0.50 U stgm x1, [sp] +# CHECK-NEXT: 1 1 0.50 U stgm xzr, [x2] +# CHECK-NEXT: 1 1 0.50 U stzgm x0, [x1] +# CHECK-NEXT: 1 1 0.50 U stzgm x1, [sp] +# CHECK-NEXT: 1 1 0.50 U stzgm xzr, [x2] + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - Ampere1BUnitA +# CHECK-NEXT: [0.1] - Ampere1BUnitA +# CHECK-NEXT: [1.0] - Ampere1BUnitB +# CHECK-NEXT: [1.1] - Ampere1BUnitB +# CHECK-NEXT: [2] - Ampere1BUnitBS +# CHECK-NEXT: [3.0] - Ampere1BUnitL +# CHECK-NEXT: [3.1] - Ampere1BUnitL +# CHECK-NEXT: 
[4.0] - Ampere1BUnitS +# CHECK-NEXT: [4.1] - Ampere1BUnitS +# CHECK-NEXT: [5] - Ampere1BUnitX +# CHECK-NEXT: [6] - Ampere1BUnitY +# CHECK-NEXT: [7] - Ampere1BUnitZ + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4.0] [4.1] [5] [6] [7] +# CHECK-NEXT: 2.50 2.50 13.00 13.00 5.00 3.00 3.00 58.00 58.00 - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4.0] [4.1] [5] [6] [7] Instructions: +# CHECK-NEXT: - - 0.50 0.50 1.00 - - - - - - - irg x0, x1 +# CHECK-NEXT: - - 0.50 0.50 1.00 - - - - - - - irg sp, x1 +# CHECK-NEXT: - - 0.50 0.50 1.00 - - - - - - - irg x0, sp +# CHECK-NEXT: - - 0.50 0.50 1.00 - - - - - - - irg x0, x1, x2 +# CHECK-NEXT: - - 0.50 0.50 1.00 - - - - - - - irg sp, x1, x2 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - addg x0, x1, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - addg sp, x2, #32, #3 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - addg x0, sp, #64, #5 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - addg x3, x4, #1008, #6 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - addg x5, x6, #112, #15 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - subg x0, x1, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - subg sp, x2, #32, #3 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - subg x0, sp, #64, #5 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - subg x3, x4, #1008, #6 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - subg x5, x6, #112, #15 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - gmi x0, x1, x2 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - gmi x3, sp, x4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - gmi xzr, x0, x30 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - gmi x30, x0, xzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subp x0, x1, x2 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subps x0, x1, x2 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subp x0, sp, sp +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subps x0, 
sp, sp +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subps xzr, x0, x1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subps xzr, sp, sp +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg x0, [x1, #-4096] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg x1, [x2, #4080] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg x2, [sp, #16] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg x3, [x1] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg sp, [x1] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg x0, [x1, #-4096] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg x1, [x2, #4080] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg x2, [sp, #16] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg x3, [x1] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg sp, [x1] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg x0, [x1, #-4096]! +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg x1, [x2, #4080]! +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg x2, [sp, #16]! +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg sp, [sp, #16]! +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg x0, [x1, #-4096]! +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg x1, [x2, #4080]! +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg x2, [sp, #16]! +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg sp, [sp, #16]! 
+# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg x0, [x1], #-4096 +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg x1, [x2], #4080 +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg x2, [sp], #16 +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg sp, [sp], #16 +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg x0, [x1], #-4096 +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg x1, [x2], #4080 +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg x2, [sp], #16 +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg sp, [sp], #16 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g x0, [x1, #-4096] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g x1, [x2, #4080] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g x2, [sp, #16] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g x3, [x1] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g sp, [x1] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g x0, [x1, #-4096] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g x1, [x2, #4080] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g x2, [sp, #16] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g x3, [x1] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g sp, [x1] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g x0, [x1, #-4096]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g x1, [x2, #4080]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g x2, [sp, #16]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g sp, [sp, #16]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g x0, [x1, #-4096]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g x1, [x2, #4080]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g x2, [sp, #16]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g sp, [sp, #16]! 
+# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g x0, [x1], #-4096 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g x1, [x2], #4080 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g x2, [sp], #16 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g sp, [sp], #16 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g x0, [x1], #-4096 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g x1, [x2], #4080 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g x2, [sp], #16 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g sp, [sp], #16 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, x1, [x2, #-1024] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, x1, [x2, #1008] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, x1, [sp, #16] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp xzr, x1, [x2, #16] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, xzr, [x2, #16] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, xzr, [x2] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, x1, [x2, #-1024]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, x1, [x2, #1008]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, x1, [sp, #16]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp xzr, x1, [x2, #16]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, xzr, [x2, #16]! 
+# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, x1, [x2], #-1024 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, x1, [x2], #1008 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, x1, [sp], #16 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp xzr, x1, [x2], #16 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, xzr, [x2], #16 +# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - ldg x0, [x1] +# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - ldg x2, [sp, #-4096] +# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - ldg x3, [x4, #4080] +# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - ldgm x0, [x1] +# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - ldgm x1, [sp] +# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - ldgm xzr, [x2] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stgm x0, [x1] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stgm x1, [sp] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stgm xzr, [x2] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzgm x0, [x1] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzgm x1, [sp] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzgm xzr, [x2] diff --git a/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/neon-instructions.s new file mode 100644 index 00000000000000..827c13a24763de --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/neon-instructions.s @@ -0,0 +1,3235 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=ampere1b -instruction-tables < %s | FileCheck %s + +abs d29, d24 +abs v0.16b, v0.16b +abs v0.2d, v0.2d +abs v0.2s, v0.2s +abs v0.4h, v0.4h +abs v0.4s, v0.4s +abs v0.8b, v0.8b +abs v0.8h, v0.8h +add d17, d31, d29 +add v0.8b, v0.8b, v0.8b +addhn v0.2s, v0.2d, v0.2d +addhn v0.4h, v0.4s, v0.4s +addhn v0.8b, v0.8h, v0.8h +addhn2 v0.16b, v0.8h, v0.8h +addhn2 v0.4s, v0.2d, v0.2d +addhn2 v0.8h, v0.4s, v0.4s +addp v0.2d, v0.2d, v0.2d +addp v0.8b, v0.8b, 
v0.8b +and v0.8b, v0.8b, v0.8b +bic v0.4h, #15, lsl #8 +bic v0.8b, v0.8b, v0.8b +bif v0.16b, v0.16b, v0.16b +bit v0.16b, v0.16b, v0.16b +bsl v0.8b, v0.8b, v0.8b +cls v0.16b, v0.16b +cls v0.2s, v0.2s +cls v0.4h, v0.4h +cls v0.4s, v0.4s +cls v0.8b, v0.8b +cls v0.8h, v0.8h +clz v0.16b, v0.16b +clz v0.2s, v0.2s +clz v0.4h, v0.4h +clz v0.4s, v0.4s +clz v0.8b, v0.8b +clz v0.8h, v0.8h +cmeq d20, d21, 0 +cmeq d20, d21, d22 +cmeq v0.16b, v0.16b, 0 +cmeq v0.16b, v0.16b, v0.16b +cmge d20, d21, 0 +cmge d20, d21, d22 +cmge v0.4h, v0.4h, v0.4h +cmge v0.8b, v0.8b, 0 +cmgt d20, d21, 0 +cmgt d20, d21, d22 +cmgt v0.2s, v0.2s, 0 +cmgt v0.4s, v0.4s, v0.4s +cmhi d20, d21, d22 +cmhi v0.8h, v0.8h, v0.8h +cmhs d20, d21, d22 +cmhs v0.8b, v0.8b, v0.8b +cmle d20, d21, 0 +cmle v0.2d, v0.2d, 0 +cmlt d20, d21, 0 +cmlt v0.8h, v0.8h, 0 +cmtst d20, d21, d22 +cmtst v0.2s, v0.2s, v0.2s +cnt v0.16b, v0.16b +cnt v0.8b, v0.8b +dup v0.16b,w28 +dup v0.2d,x28 +dup v0.2s,w28 +dup v0.4h,w28 +dup v0.4s,w28 +dup v0.8b,w28 +dup v0.8h,w28 +eor v0.16b, v0.16b, v0.16b +ext v0.16b, v0.16b, v0.16b, #3 +ext v0.8b, v0.8b, v0.8b, #3 +fabd d29, d24, d20 +fabd s29, s24, s20 +fabd v0.4s, v0.4s, v0.4s +fabs v0.2d, v0.2d +fabs v0.2s, v0.2s +fabs v0.4h, v0.4h +fabs v0.4s, v0.4s +fabs v0.8h, v0.8h +facge d20, d21, d22 +facge s10, s11, s12 +facge v0.4s, v0.4s, v0.4s +facgt d20, d21, d22 +facgt s10, s11, s12 +facgt v0.2d, v0.2d, v0.2d +fadd v0.4s, v0.4s, v0.4s +faddp v0.2s, v0.2s, v0.2s +faddp v0.4s, v0.4s, v0.4s +fcmeq d20, d21, #0.0 +fcmeq d20, d21, d22 +fcmeq s10, s11, #0.0 +fcmeq s10, s11, s12 +fcmeq v0.2s, v0.2s, #0.0 +fcmeq v0.2s, v0.2s, v0.2s +fcmge d20, d21, #0.0 +fcmge d20, d21, d22 +fcmge s10, s11, #0.0 +fcmge s10, s11, s12 +fcmge v0.2d, v0.2d, #0.0 +fcmge v0.4s, v0.4s, v0.4s +fcmgt d20, d21, #0.0 +fcmgt d20, d21, d22 +fcmgt s10, s11, #0.0 +fcmgt s10, s11, s12 +fcmgt v0.4s, v0.4s, #0.0 +fcmgt v0.4s, v0.4s, v0.4s +fcmle d20, d21, #0.0 +fcmle s10, s11, #0.0 +fcmle v0.2d, v0.2d, #0.0 +fcmlt d20, d21, #0.0 +fcmlt s10, 
s11, #0.0 +fcmlt v0.4s, v0.4s, #0.0 +fcvtas d21, d14 +fcvtas s12, s13 +fcvtas v0.2d, v0.2d +fcvtas v0.2s, v0.2s +fcvtas v0.4h, v0.4h +fcvtas v0.4s, v0.4s +fcvtas v0.8h, v0.8h +fcvtau d21, d14 +fcvtau s12, s13 +fcvtau v0.2d, v0.2d +fcvtau v0.2s, v0.2s +fcvtau v0.4h, v0.4h +fcvtau v0.4s, v0.4s +fcvtau v0.8h, v0.8h +fcvtl v0.2d, v0.2s +fcvtl v0.4s, v0.4h +fcvtl2 v0.2d, v0.4s +fcvtl2 v0.4s, v0.8h +fcvtms d21, d14 +fcvtms s22, s13 +fcvtms v0.2d, v0.2d +fcvtms v0.2s, v0.2s +fcvtms v0.4h, v0.4h +fcvtms v0.4s, v0.4s +fcvtms v0.8h, v0.8h +fcvtmu d21, d14 +fcvtmu s12, s13 +fcvtmu v0.2d, v0.2d +fcvtmu v0.2s, v0.2s +fcvtmu v0.4h, v0.4h +fcvtmu v0.4s, v0.4s +fcvtmu v0.8h, v0.8h +fcvtn v0.2s, v0.2d +fcvtn v0.4h, v0.4s +fcvtn2 v0.4s, v0.2d +fcvtn2 v0.8h, v0.4s +fcvtns d21, d14 +fcvtns s22, s13 +fcvtns v0.2d, v0.2d +fcvtns v0.2s, v0.2s +fcvtns v0.4h, v0.4h +fcvtns v0.4s, v0.4s +fcvtns v0.8h, v0.8h +fcvtnu d21, d14 +fcvtnu s12, s13 +fcvtnu v0.2d, v0.2d +fcvtnu v0.2s, v0.2s +fcvtnu v0.4h, v0.4h +fcvtnu v0.4s, v0.4s +fcvtnu v0.8h, v0.8h +fcvtps d21, d14 +fcvtps s22, s13 +fcvtps v0.2d, v0.2d +fcvtps v0.2s, v0.2s +fcvtps v0.4h, v0.4h +fcvtps v0.4s, v0.4s +fcvtps v0.8h, v0.8h +fcvtpu d21, d14 +fcvtpu s12, s13 +fcvtpu v0.2d, v0.2d +fcvtpu v0.2s, v0.2s +fcvtpu v0.4h, v0.4h +fcvtpu v0.4s, v0.4s +fcvtpu v0.8h, v0.8h +fcvtxn s22, d13 +fcvtxn v0.2s, v0.2d +fcvtxn2 v0.4s, v0.2d +fcvtzs d21, d12, #1 +fcvtzs d21, d14 +fcvtzs s12, s13 +fcvtzs s21, s12, #1 +fcvtzs v0.2d, v0.2d +fcvtzs v0.2d, v0.2d, #3 +fcvtzs v0.2s, v0.2s +fcvtzs v0.2s, v0.2s, #3 +fcvtzs v0.4h, v0.4h +fcvtzs v0.4s, v0.4s +fcvtzs v0.4s, v0.4s, #3 +fcvtzs v0.8h, v0.8h +fcvtzu d21, d12, #1 +fcvtzu d21, d14 +fcvtzu s12, s13 +fcvtzu s21, s12, #1 +fcvtzu v0.2d, v0.2d +fcvtzu v0.2d, v0.2d, #3 +fcvtzu v0.2s, v0.2s +fcvtzu v0.2s, v0.2s, #3 +fcvtzu v0.4h, v0.4h +fcvtzu v0.4s, v0.4s +fcvtzu v0.4s, v0.4s, #3 +fcvtzu v0.8h, v0.8h +fdiv v0.2s, v0.2s, v0.2s +fmax v0.2d, v0.2d, v0.2d +fmax v0.2s, v0.2s, v0.2s +fmax v0.4s, v0.4s, v0.4s +fmaxnm 
v0.2d, v0.2d, v0.2d +fmaxnm v0.2s, v0.2s, v0.2s +fmaxnm v0.4s, v0.4s, v0.4s +fmaxnmp v0.2d, v0.2d, v0.2d +fmaxnmp v0.2s, v0.2s, v0.2s +fmaxnmp v0.4s, v0.4s, v0.4s +fmaxp v0.2d, v0.2d, v0.2d +fmaxp v0.2s, v0.2s, v0.2s +fmaxp v0.4s, v0.4s, v0.4s +fmin v0.2d, v0.2d, v0.2d +fmin v0.2s, v0.2s, v0.2s +fmin v0.4s, v0.4s, v0.4s +fminnm v0.2d, v0.2d, v0.2d +fminnm v0.2s, v0.2s, v0.2s +fminnm v0.4s, v0.4s, v0.4s +fminnmp v0.2d, v0.2d, v0.2d +fminnmp v0.2s, v0.2s, v0.2s +fminnmp v0.4s, v0.4s, v0.4s +fminp v0.2d, v0.2d, v0.2d +fminp v0.2s, v0.2s, v0.2s +fminp v0.4s, v0.4s, v0.4s +fmla d0, d1, v0.d[1] +fmla s0, s1, v0.s[3] +fmla v0.2s, v0.2s, v0.2s +fmls d0, d4, v0.d[1] +fmls s3, s5, v0.s[3] +fmls v0.2s, v0.2s, v0.2s +fmov v0.2d, #-1.25 +fmov v0.2s, #13.0 +fmov v0.4s, #1.0 +fmul d0, d1, v0.d[1] +fmul s0, s1, v0.s[3] +fmul v0.2s, v0.2s, v0.2s +fmulx d0, d4, v0.d[1] +fmulx d23, d11, d1 +fmulx s20, s22, s15 +fmulx s3, s5, v0.s[3] +fmulx v0.2d, v0.2d, v0.2d +fmulx v0.2s, v0.2s, v0.2s +fmulx v0.4s, v0.4s, v0.4s +fneg v0.2d, v0.2d +fneg v0.2s, v0.2s +fneg v0.4h, v0.4h +fneg v0.4s, v0.4s +fneg v0.8h, v0.8h +frecpe d13, d13 +frecpe s19, s14 +frecpe v0.2d, v0.2d +frecpe v0.2s, v0.2s +frecpe v0.4h, v0.4h +frecpe v0.4s, v0.4s +frecpe v0.8h, v0.8h +frecps v0.4s, v0.4s, v0.4s +frecps d22, d30, d21 +frecps s21, s16, s13 +frecpx d16, d19 +frecpx s18, s10 +frinta v0.2d, v0.2d +frinta v0.2s, v0.2s +frinta v0.4h, v0.4h +frinta v0.4s, v0.4s +frinta v0.8h, v0.8h +frinti v0.2d, v0.2d +frinti v0.2s, v0.2s +frinti v0.4h, v0.4h +frinti v0.4s, v0.4s +frinti v0.8h, v0.8h +frintm v0.2d, v0.2d +frintm v0.2s, v0.2s +frintm v0.4h, v0.4h +frintm v0.4s, v0.4s +frintm v0.8h, v0.8h +frintn v0.2d, v0.2d +frintn v0.2s, v0.2s +frintn v0.4h, v0.4h +frintn v0.4s, v0.4s +frintn v0.8h, v0.8h +frintp v0.2d, v0.2d +frintp v0.2s, v0.2s +frintp v0.4h, v0.4h +frintp v0.4s, v0.4s +frintp v0.8h, v0.8h +frintx v0.2d, v0.2d +frintx v0.2s, v0.2s +frintx v0.4h, v0.4h +frintx v0.4s, v0.4s +frintx v0.8h, v0.8h +frintz v0.2d, v0.2d 
+frintz v0.2s, v0.2s +frintz v0.4h, v0.4h +frintz v0.4s, v0.4s +frintz v0.8h, v0.8h +frsqrte d21, d12 +frsqrte s22, s13 +frsqrte v0.2d, v0.2d +frsqrte v0.2s, v0.2s +frsqrte v0.4h, v0.4h +frsqrte v0.4s, v0.4s +frsqrte v0.8h, v0.8h +frsqrts d8, d22, d18 +frsqrts s21, s5, s12 +frsqrts v0.2d, v0.2d, v0.2d +fsqrt v0.2d, v0.2d +fsqrt v0.2s, v0.2s +fsqrt v0.4h, v0.4h +fsqrt v0.4s, v0.4s +fsqrt v0.8h, v0.8h +fsub v0.2s, v0.2s, v0.2s +ld1 { v0.16b }, [x0] +ld1 { v0.2d, v1.2d, v2.2d }, [x0], #48 +ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +ld1 { v0.4s, v1.4s }, [sp], #32 +ld1 { v0.4s, v1.4s, v2.4s }, [sp] +ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3 +ld1 { v0.8h }, [x15], x2 +ld1 { v0.8h, v1.8h }, [x15] +ld1 { v0.b }[9], [x0] +ld1 { v0.b }[9], [x0], #1 +ld1r { v0.16b }, [x0] +ld1r { v0.16b }, [x0], #1 +ld1r { v0.8h }, [x15] +ld1r { v0.8h }, [x15], #2 +ld2 { v0.16b, v1.16b }, [x0], x1 +ld2 { v0.8b, v1.8b }, [x0] +ld2 { v0.h, v1.h }[7], [x15] +ld2 { v0.h, v1.h }[7], [x15], #4 +ld2r { v0.2d, v1.2d }, [x0] +ld2r { v0.2d, v1.2d }, [x0], #16 +ld2r { v0.4s, v1.4s }, [sp] +ld2r { v0.4s, v1.4s }, [sp], #8 +ld3 { v0.4h, v1.4h, v2.4h }, [x15] +ld3 { v0.8h, v1.8h, v2.8h }, [x15], x2 +ld3 { v0.s, v1.s, v2.s }[3], [sp] +ld3 { v0.s, v1.s, v2.s }[3], [sp], x3 +ld3r { v0.4h, v1.4h, v2.4h }, [x15] +ld3r { v0.4h, v1.4h, v2.4h }, [x15], #6 +ld3r { v0.8b, v1.8b, v2.8b }, [x0] +ld3r { v0.8b, v1.8b, v2.8b }, [x0], #3 +ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64 +ld4 { v0.d, v1.d, v2.d, v3.d }[1], [x0] +ld4 { v0.d, v1.d, v2.d, v3.d }[1], [x0], #32 +ld4 { v0.h, v1.h, v2.h, v3.h }[7], [x0], x0 +ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [sp] +ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [sp], x7 +ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x30 +mla v0.8b, v0.8b, v0.8b +mls v0.4h, v0.4h, v0.4h +mov b0, v0.b[15] +mov d6, v0.d[1] +mov h2, v0.h[5] +mov s17, v0.s[2] +mov v2.b[0], v0.b[0] +mov v2.h[1], v0.h[1] +mov v2.s[2], v0.s[2] +mov 
v2.d[1], v0.d[1] +mov v0.b[0], w8 +mov v0.h[1], w8 +mov v0.s[2], w8 +mov v0.d[1], x8 +mov v0.16b, v0.16b +mov v0.8b, v0.8b +movi d15, #0xff00ff00ff00ff +movi v0.16b, #31 +movi v0.2d, #0xff0000ff0000ffff +movi v0.2s, #8, msl #8 +movi v0.4s, #255, lsl #24 +movi v0.8b, #255 +mul v0.8b, v0.8b, v0.8b +mvni v0.2s, 0 +mvni v0.4s, #16, msl #16 +neg d29, d24 +neg v0.16b, v0.16b +neg v0.2d, v0.2d +neg v0.2s, v0.2s +neg v0.4h, v0.4h +neg v0.4s, v0.4s +neg v0.8b, v0.8b +neg v0.8h, v0.8h +not v0.16b, v0.16b +not v0.8b, v0.8b +orn v0.16b, v0.16b, v0.16b +orr v0.16b, v0.16b, v0.16b +orr v0.8h, #31 +pmul v0.16b, v0.16b, v0.16b +pmul v0.8b, v0.8b, v0.8b +pmull v0.8h, v0.8b, v0.8b +pmull2 v0.8h, v0.16b, v0.16b +raddhn v0.2s, v0.2d, v0.2d +raddhn v0.4h, v0.4s, v0.4s +raddhn v0.8b, v0.8h, v0.8h +raddhn2 v0.16b, v0.8h, v0.8h +raddhn2 v0.4s, v0.2d, v0.2d +raddhn2 v0.8h, v0.4s, v0.4s +rbit v0.16b, v0.16b +rbit v0.8b, v0.8b +rev16 v21.8b, v1.8b +rev16 v30.16b, v31.16b +rev32 v0.4h, v9.4h +rev32 v21.8b, v1.8b +rev32 v30.16b, v31.16b +rev32 v4.8h, v7.8h +rev64 v0.16b, v31.16b +rev64 v1.8b, v9.8b +rev64 v13.4h, v21.4h +rev64 v2.8h, v4.8h +rev64 v4.2s, v0.2s +rev64 v6.4s, v8.4s +rshrn v0.2s, v0.2d, #3 +rshrn v0.4h, v0.4s, #3 +rshrn v0.8b, v0.8h, #3 +rshrn2 v0.16b, v0.8h, #3 +rshrn2 v0.4s, v0.2d, #3 +rshrn2 v0.8h, v0.4s, #3 +rsubhn v0.2s, v0.2d, v0.2d +rsubhn v0.4h, v0.4s, v0.4s +rsubhn v0.8b, v0.8h, v0.8h +rsubhn2 v0.16b, v0.8h, v0.8h +rsubhn2 v0.4s, v0.2d, v0.2d +rsubhn2 v0.8h, v0.4s, v0.4s +saba v0.16b, v0.16b, v0.16b +sabal v0.2d, v0.2s, v0.2s +sabal v0.4s, v0.4h, v0.4h +sabal v0.8h, v0.8b, v0.8b +sabal2 v0.2d, v0.4s, v0.4s +sabal2 v0.4s, v0.8h, v0.8h +sabal2 v0.8h, v0.16b, v0.16b +sabd v0.4h, v0.4h, v0.4h +sabdl v0.2d, v0.2s, v0.2s +sabdl v0.4s, v0.4h, v0.4h +sabdl v0.8h, v0.8b, v0.8b +sabdl2 v0.2d, v0.4s, v0.4s +sabdl2 v0.4s, v0.8h, v0.8h +sabdl2 v0.8h, v0.16b, v0.16b +sadalp v0.1d, v0.2s +sadalp v0.2d, v0.4s +sadalp v0.2s, v0.4h +sadalp v0.4h, v0.8b +sadalp v0.4s, v0.8h +sadalp v0.8h, 
v0.16b +saddl v0.2d, v0.2s, v0.2s +saddl v0.4s, v0.4h, v0.4h +saddl v0.8h, v0.8b, v0.8b +saddl2 v0.2d, v0.4s, v0.4s +saddl2 v0.4s, v0.8h, v0.8h +saddl2 v0.8h, v0.16b, v0.16b +saddlp v0.1d, v0.2s +saddlp v0.2d, v0.4s +saddlp v0.2s, v0.4h +saddlp v0.4h, v0.8b +saddlp v0.4s, v0.8h +saddlp v0.8h, v0.16b +saddw v0.2d, v0.2d, v0.2s +saddw v0.4s, v0.4s, v0.4h +saddw v0.8h, v0.8h, v0.8b +saddw2 v0.2d, v0.2d, v0.4s +saddw2 v0.4s, v0.4s, v0.8h +saddw2 v0.8h, v0.8h, v0.16b +scvtf d21, d12 +scvtf d21, d12, #64 +scvtf s22, s13 +scvtf s22, s13, #32 +scvtf v0.2d, v0.2d +scvtf v0.2d, v0.2d, #3 +scvtf v0.2s, v0.2s +scvtf v0.2s, v0.2s, #3 +scvtf v0.4h, v0.4h +scvtf v0.4s, v0.4s +scvtf v0.4s, v0.4s, #3 +scvtf v0.8h, v0.8h +shadd v0.8b, v0.8b, v0.8b +shl d7, d10, #12 +shl v0.16b, v0.16b, #3 +shl v0.2d, v0.2d, #3 +shl v0.4h, v0.4h, #3 +shl v0.4s, v0.4s, #3 +shll v0.2d, v0.2s, #32 +shll v0.4s, v0.4h, #16 +shll v0.8h, v0.8b, #8 +shll v0.2d, v0.2s, #32 +shll v0.4s, v0.4h, #16 +shll v0.8h, v0.8b, #8 +shll2 v0.2d, v0.4s, #32 +shll2 v0.4s, v0.8h, #16 +shll2 v0.8h, v0.16b, #8 +shll2 v0.2d, v0.4s, #32 +shll2 v0.4s, v0.8h, #16 +shll2 v0.8h, v0.16b, #8 +shrn v0.2s, v0.2d, #3 +shrn v0.4h, v0.4s, #3 +shrn v0.8b, v0.8h, #3 +shrn2 v0.16b, v0.8h, #3 +shrn2 v0.4s, v0.2d, #3 +shrn2 v0.8h, v0.4s, #3 +shsub v0.2s, v0.2s, v0.2s +shsub v0.4h, v0.4h, v0.4h +sli d10, d14, #12 +sli v0.16b, v0.16b, #3 +sli v0.2d, v0.2d, #3 +sli v0.2s, v0.2s, #3 +sli v0.4h, v0.4h, #3 +sli v0.4s, v0.4s, #3 +sli v0.8b, v0.8b, #3 +sli v0.8h, v0.8h, #3 +smax v0.2s, v0.2s, v0.2s +smax v0.4h, v0.4h, v0.4h +smax v0.8b, v0.8b, v0.8b +smaxp v0.2s, v0.2s, v0.2s +smaxp v0.4h, v0.4h, v0.4h +smaxp v0.8b, v0.8b, v0.8b +smin v0.16b, v0.16b, v0.16b +smin v0.4s, v0.4s, v0.4s +smin v0.8h, v0.8h, v0.8h +sminp v0.16b, v0.16b, v0.16b +sminp v0.4s, v0.4s, v0.4s +sminp v0.8h, v0.8h, v0.8h +smlal v0.2d, v0.2s, v0.2s +smlal v0.4s, v0.4h, v0.4h +smlal v0.8h, v0.8b, v0.8b +smlal2 v0.2d, v0.4s, v0.4s +smlal2 v0.4s, v0.8h, v0.8h +smlal2 v0.8h, v0.16b, 
v0.16b +smlsl v0.2d, v0.2s, v0.2s +smlsl v0.4s, v0.4h, v0.4h +smlsl v0.8h, v0.8b, v0.8b +smlsl2 v0.2d, v0.4s, v0.4s +smlsl2 v0.4s, v0.8h, v0.8h +smlsl2 v0.8h, v0.16b, v0.16b +smull v0.2d, v0.2s, v0.2s +smull v0.4s, v0.4h, v0.4h +smull v0.8h, v0.8b, v0.8b +smull2 v0.2d, v0.4s, v0.4s +smull2 v0.4s, v0.8h, v0.8h +smull2 v0.8h, v0.16b, v0.16b +sqabs b19, b14 +sqabs d18, d12 +sqabs h21, h15 +sqabs s20, s12 +sqabs v0.16b, v0.16b +sqabs v0.2d, v0.2d +sqabs v0.2s, v0.2s +sqabs v0.4h, v0.4h +sqabs v0.4s, v0.4s +sqabs v0.8b, v0.8b +sqabs v0.8h, v0.8h +sqadd b20, b11, b15 +sqadd v0.16b, v0.16b, v0.16b +sqadd v0.2s, v0.2s, v0.2s +sqdmlal d19, s24, s12 +sqdmlal d8, s9, v0.s[1] +sqdmlal s0, h0, v0.h[3] +sqdmlal s17, h27, h12 +sqdmlal v0.2d, v0.2s, v0.2s +sqdmlal v0.4s, v0.4h, v0.4h +sqdmlal2 v0.2d, v0.4s, v0.4s +sqdmlal2 v0.4s, v0.8h, v0.8h +sqdmlsl d12, s23, s13 +sqdmlsl d8, s9, v0.s[1] +sqdmlsl s0, h0, v0.h[3] +sqdmlsl s14, h12, h25 +sqdmlsl v0.2d, v0.2s, v0.2s +sqdmlsl v0.4s, v0.4h, v0.4h +sqdmlsl2 v0.2d, v0.4s, v0.4s +sqdmlsl2 v0.4s, v0.8h, v0.8h +sqdmulh h10, h11, h12 +sqdmulh h7, h15, v0.h[3] +sqdmulh s15, s14, v0.s[1] +sqdmulh s20, s21, s2 +sqdmulh v0.2s, v0.2s, v0.2s +sqdmulh v0.4s, v0.4s, v0.4s +sqdmull d1, s1, v0.s[1] +sqdmull d15, s22, s12 +sqdmull s1, h1, v0.h[3] +sqdmull s12, h22, h12 +sqdmull v0.2d, v0.2s, v0.2s +sqdmull v0.4s, v0.4h, v0.4h +sqdmull2 v0.2d, v0.4s, v0.4s +sqdmull2 v0.4s, v0.8h, v0.8h +sqneg b19, b14 +sqneg d18, d12 +sqneg h21, h15 +sqneg s20, s12 +sqneg v0.16b, v0.16b +sqneg v0.2d, v0.2d +sqneg v0.2s, v0.2s +sqneg v0.4h, v0.4h +sqneg v0.4s, v0.4s +sqneg v0.8b, v0.8b +sqneg v0.8h, v0.8h +sqrdmulh h10, h11, h12 +sqrdmulh h7, h15, v0.h[3] +sqrdmulh s15, s14, v0.s[1] +sqrdmulh s20, s21, s2 +sqrdmulh v0.4h, v0.4h, v0.4h +sqrdmulh v0.8h, v0.8h, v0.8h +sqrshl d31, d31, d31 +sqrshl h3, h4, h15 +sqrshl v0.2s, v0.2s, v0.2s +sqrshl v0.4h, v0.4h, v0.4h +sqrshl v0.8b, v0.8b, v0.8b +sqrshrn b10, h13, #2 +sqrshrn h15, s10, #6 +sqrshrn s15, d12, #9 +sqrshrn v0.2s, 
v0.2d, #3 +sqrshrn v0.4h, v0.4s, #3 +sqrshrn v0.8b, v0.8h, #3 +sqrshrn2 v0.16b, v0.8h, #3 +sqrshrn2 v0.4s, v0.2d, #3 +sqrshrn2 v0.8h, v0.4s, #3 +sqrshrun b17, h10, #6 +sqrshrun h10, s13, #15 +sqrshrun s22, d16, #31 +sqrshrun v0.2s, v0.2d, #3 +sqrshrun v0.4h, v0.4s, #3 +sqrshrun v0.8b, v0.8h, #3 +sqrshrun2 v0.16b, v0.8h, #3 +sqrshrun2 v0.4s, v0.2d, #3 +sqrshrun2 v0.8h, v0.4s, #3 +sqshl b11, b19, #7 +sqshl d15, d16, #51 +sqshl d31, d31, d31 +sqshl h13, h18, #11 +sqshl h3, h4, h15 +sqshl s14, s17, #22 +sqshl v0.16b, v0.16b, #3 +sqshl v0.2d, v0.2d, #3 +sqshl v0.2s, v0.2s, #3 +sqshl v0.2s, v0.2s, v0.2s +sqshl v0.4h, v0.4h, #3 +sqshl v0.4h, v0.4h, v0.4h +sqshl v0.4s, v0.4s, #3 +sqshl v0.8b, v0.8b, #3 +sqshl v0.8b, v0.8b, v0.8b +sqshl v0.8h, v0.8h, #3 +sqshlu b15, b18, #6 +sqshlu d11, d13, #32 +sqshlu h19, h17, #6 +sqshlu s16, s14, #25 +sqshlu v0.16b, v0.16b, #3 +sqshlu v0.2d, v0.2d, #3 +sqshlu v0.2s, v0.2s, #3 +sqshlu v0.4h, v0.4h, #3 +sqshlu v0.4s, v0.4s, #3 +sqshlu v0.8b, v0.8b, #3 +sqshlu v0.8h, v0.8h, #3 +sqshrn b10, h15, #5 +sqshrn h17, s10, #4 +sqshrn s18, d10, #31 +sqshrn v0.2s, v0.2d, #3 +sqshrn v0.4h, v0.4s, #3 +sqshrn v0.8b, v0.8h, #3 +sqshrn2 v0.16b, v0.8h, #3 +sqshrn2 v0.4s, v0.2d, #3 +sqshrn2 v0.8h, v0.4s, #3 +sqshrun b15, h10, #7 +sqshrun h20, s14, #3 +sqshrun s10, d15, #15 +sqshrun v0.2s, v0.2d, #3 +sqshrun v0.4h, v0.4s, #3 +sqshrun v0.8b, v0.8h, #3 +sqshrun2 v0.16b, v0.8h, #3 +sqshrun2 v0.4s, v0.2d, #3 +sqshrun2 v0.8h, v0.4s, #3 +sqsub s20, s10, s7 +sqsub v0.2d, v0.2d, v0.2d +sqsub v0.4s, v0.4s, v0.4s +sqsub v0.8b, v0.8b, v0.8b +sqxtn b18, h18 +sqxtn h20, s17 +sqxtn s19, d14 +sqxtn v0.2s, v0.2d +sqxtn v0.4h, v0.4s +sqxtn v0.8b, v0.8h +sqxtn2 v0.16b, v0.8h +sqxtn2 v0.4s, v0.2d +sqxtn2 v0.8h, v0.4s +sqxtun b19, h14 +sqxtun h21, s15 +sqxtun s20, d12 +sqxtun v0.2s, v0.2d +sqxtun v0.4h, v0.4s +sqxtun v0.8b, v0.8h +sqxtun2 v0.16b, v0.8h +sqxtun2 v0.4s, v0.2d +sqxtun2 v0.8h, v0.4s +srhadd v0.2s, v0.2s, v0.2s +srhadd v0.4h, v0.4h, v0.4h +srhadd v0.8b, v0.8b, 
v0.8b +sri d10, d12, #14 +sri v0.16b, v0.16b, #3 +sri v0.2d, v0.2d, #3 +sri v0.2s, v0.2s, #3 +sri v0.4h, v0.4h, #3 +sri v0.4s, v0.4s, #3 +sri v0.8b, v0.8b, #3 +sri v0.8h, v0.8h, #3 +srshl d16, d16, d16 +srshl v0.2s, v0.2s, v0.2s +srshl v0.4h, v0.4h, v0.4h +srshl v0.8b, v0.8b, v0.8b +srshr d19, d18, #7 +srshr v0.16b, v0.16b, #3 +srshr v0.2d, v0.2d, #3 +srshr v0.2s, v0.2s, #3 +srshr v0.4h, v0.4h, #3 +srshr v0.4s, v0.4s, #3 +srshr v0.8b, v0.8b, #3 +srshr v0.8h, v0.8h, #3 +srsra d15, d11, #19 +srsra v0.16b, v0.16b, #3 +srsra v0.2d, v0.2d, #3 +srsra v0.2s, v0.2s, #3 +srsra v0.4h, v0.4h, #3 +srsra v0.4s, v0.4s, #3 +srsra v0.8b, v0.8b, #3 +srsra v0.8h, v0.8h, #3 +sshl d31, d31, d31 +sshl v0.2d, v0.2d, v0.2d +sshl v0.2s, v0.2s, v0.2s +sshl v0.4h, v0.4h, v0.4h +sshl v0.8b, v0.8b, v0.8b +sshll v0.2d, v0.2s, #3 +sshll2 v0.4s, v0.8h, #3 +sshr d15, d16, #12 +sshr v0.16b, v0.16b, #3 +sshr v0.2d, v0.2d, #3 +sshr v0.2s, v0.2s, #3 +sshr v0.4h, v0.4h, #3 +sshr v0.4s, v0.4s, #3 +sshr v0.8b, v0.8b, #3 +sshr v0.8h, v0.8h, #3 +ssra d18, d12, #21 +ssra v0.16b, v0.16b, #3 +ssra v0.2d, v0.2d, #3 +ssra v0.2s, v0.2s, #3 +ssra v0.4h, v0.4h, #3 +ssra v0.4s, v0.4s, #3 +ssra v0.8b, v0.8b, #3 +ssra v0.8h, v0.8h, #3 +ssubl v0.2d, v0.2s, v0.2s +ssubl v0.4s, v0.4h, v0.4h +ssubl v0.8h, v0.8b, v0.8b +ssubl2 v0.2d, v0.4s, v0.4s +ssubl2 v0.4s, v0.8h, v0.8h +ssubl2 v0.8h, v0.16b, v0.16b +ssubw v0.2d, v0.2d, v0.2s +ssubw v0.4s, v0.4s, v0.4h +ssubw v0.8h, v0.8h, v0.8b +ssubw2 v0.2d, v0.2d, v0.4s +ssubw2 v0.4s, v0.4s, v0.8h +ssubw2 v0.8h, v0.8h, v0.16b +st1 { v0.16b }, [x0] +st1 { v0.2d, v1.2d, v2.2d }, [x0], #48 +st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +st1 { v0.4s, v1.4s }, [sp], #32 +st1 { v0.4s, v1.4s, v2.4s }, [sp] +st1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3 +st1 { v0.8h }, [x15], x2 +st1 { v0.8h, v1.8h }, [x15] +st1 { v0.d }[1], [x0] +st1 { v0.d }[1], [x0], #8 +st2 { v0.16b, v1.16b }, [x0], x1 +st2 { v0.8b, v1.8b }, [x0] +st2 { v0.s, v1.s }[3], [sp] +st2 { v0.s, v1.s }[3], [sp], #8 +st3 { v0.4h, 
v1.4h, v2.4h }, [x15] +st3 { v0.8h, v1.8h, v2.8h }, [x15], x2 +st3 { v0.h, v1.h, v2.h }[7], [x15] +st3 { v0.h, v1.h, v2.h }[7], [x15], #6 +st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64 +st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0] +st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0], x5 +sub d15, d5, d16 +sub v0.2d, v0.2d, v0.2d +suqadd b19, b14 +suqadd d18, d22 +suqadd h20, h15 +suqadd s21, s12 +suqadd v0.16b, v0.16b +suqadd v0.2d, v0.2d +suqadd v0.2s, v0.2s +suqadd v0.4h, v0.4h +suqadd v0.4s, v0.4s +suqadd v0.8b, v0.8b +suqadd v0.8h, v0.8h +tbl v0.16b, { v0.16b }, v0.16b +tbl v0.16b, { v0.16b, v1.16b }, v0.16b +tbl v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b +tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b +tbl v0.8b, { v0.16b }, v0.8b +tbl v0.8b, { v0.16b, v1.16b }, v0.8b +tbl v0.8b, { v0.16b, v1.16b, v2.16b }, v0.8b +tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b +tbx v0.16b, { v0.16b }, v0.16b +tbx v0.16b, { v0.16b, v1.16b }, v0.16b +tbx v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b +tbx v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b +tbx v0.8b, { v0.16b }, v0.8b +tbx v0.8b, { v0.16b, v1.16b }, v0.8b +tbx v0.8b, { v0.16b, v1.16b, v2.16b }, v0.8b +tbx v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b +trn1 v0.16b, v0.16b, v0.16b +trn1 v0.2d, v0.2d, v0.2d +trn1 v0.2s, v0.2s, v0.2s +trn1 v0.4h, v0.4h, v0.4h +trn1 v0.4s, v0.4s, v0.4s +trn1 v0.8b, v0.8b, v0.8b +trn1 v0.8h, v0.8h, v0.8h +trn2 v0.16b, v0.16b, v0.16b +trn2 v0.2d, v0.2d, v0.2d +trn2 v0.2s, v0.2s, v0.2s +trn2 v0.4h, v0.4h, v0.4h +trn2 v0.4s, v0.4s, v0.4s +trn2 v0.8b, v0.8b, v0.8b +trn2 v0.8h, v0.8h, v0.8h +uaba v0.8b, v0.8b, v0.8b +uabal v0.2d, v0.2s, v0.2s +uabal v0.4s, v0.4h, v0.4h +uabal v0.8h, v0.8b, v0.8b +uabal2 v0.2d, v0.4s, v0.4s +uabal2 v0.4s, v0.8h, v0.8h +uabal2 v0.8h, v0.16b, v0.16b +uabd v0.4h, v0.4h, v0.4h +uabdl v0.2d, v0.2s, v0.2s +uabdl v0.4s, v0.4h, v0.4h +uabdl v0.8h, v0.8b, v0.8b +uabdl2 v0.2d, v0.4s, v0.4s +uabdl2 v0.4s, v0.8h, v0.8h +uabdl2 
v0.8h, v0.16b, v0.16b +uadalp v0.1d, v0.2s +uadalp v0.2d, v0.4s +uadalp v0.2s, v0.4h +uadalp v0.4h, v0.8b +uadalp v0.4s, v0.8h +uadalp v0.8h, v0.16b +uaddl v0.2d, v0.2s, v0.2s +uaddl v0.4s, v0.4h, v0.4h +uaddl v0.8h, v0.8b, v0.8b +uaddl2 v0.2d, v0.4s, v0.4s +uaddl2 v0.4s, v0.8h, v0.8h +uaddl2 v0.8h, v0.16b, v0.16b +uaddlp v0.1d, v0.2s +uaddlp v0.2d, v0.4s +uaddlp v0.2s, v0.4h +uaddlp v0.4h, v0.8b +uaddlp v0.4s, v0.8h +uaddlp v0.8h, v0.16b +uaddw v0.2d, v0.2d, v0.2s +uaddw v0.4s, v0.4s, v0.4h +uaddw v0.8h, v0.8h, v0.8b +uaddw2 v0.2d, v0.2d, v0.4s +uaddw2 v0.4s, v0.4s, v0.8h +uaddw2 v0.8h, v0.8h, v0.16b +ucvtf d21, d14 +ucvtf d21, d14, #64 +ucvtf s22, s13 +ucvtf s22, s13, #32 +ucvtf v0.2d, v0.2d +ucvtf v0.2d, v0.2d, #3 +ucvtf v0.2s, v0.2s +ucvtf v0.2s, v0.2s, #3 +ucvtf v0.4h, v0.4h +ucvtf v0.4s, v0.4s +ucvtf v0.4s, v0.4s, #3 +ucvtf v0.8h, v0.8h +uhadd v0.16b, v0.16b, v0.16b +uhadd v0.8h, v0.8h, v0.8h +uhsub v0.4s, v0.4s, v0.4s +umax v0.16b, v0.16b, v0.16b +umax v0.4s, v0.4s, v0.4s +umax v0.8h, v0.8h, v0.8h +umaxp v0.16b, v0.16b, v0.16b +umaxp v0.4s, v0.4s, v0.4s +umaxp v0.8h, v0.8h, v0.8h +umin v0.2s, v0.2s, v0.2s +umin v0.4h, v0.4h, v0.4h +umin v0.8b, v0.8b, v0.8b +uminp v0.2s, v0.2s, v0.2s +uminp v0.4h, v0.4h, v0.4h +uminp v0.8b, v0.8b, v0.8b +umlal v0.2d, v0.2s, v0.2s +umlal v0.4s, v0.4h, v0.4h +umlal v0.8h, v0.8b, v0.8b +umlal2 v0.2d, v0.4s, v0.4s +umlal2 v0.4s, v0.8h, v0.8h +umlal2 v0.8h, v0.16b, v0.16b +umlsl v0.2d, v0.2s, v0.2s +umlsl v0.4s, v0.4h, v0.4h +umlsl v0.8h, v0.8b, v0.8b +umlsl2 v0.2d, v0.4s, v0.4s +umlsl2 v0.4s, v0.8h, v0.8h +umlsl2 v0.8h, v0.16b, v0.16b +umull v0.2d, v0.2s, v0.2s +umull v0.4s, v0.4h, v0.4h +umull v0.8h, v0.8b, v0.8b +umull2 v0.2d, v0.4s, v0.4s +umull2 v0.4s, v0.8h, v0.8h +umull2 v0.8h, v0.16b, v0.16b +uqadd h0, h1, h5 +uqadd v0.8h, v0.8h, v0.8h +uqrshl b11, b20, b30 +uqrshl s23, s20, s16 +uqrshl v0.16b, v0.16b, v0.16b +uqrshl v0.4s, v0.4s, v0.4s +uqrshl v0.4s, v0.4s, v0.4s +uqrshl v0.8h, v0.8h, v0.8h +uqrshrn b10, h12, #5 +uqrshrn 
h12, s10, #14 +uqrshrn s10, d10, #25 +uqrshrn v0.2s, v0.2d, #3 +uqrshrn v0.4h, v0.4s, #3 +uqrshrn v0.8b, v0.8h, #3 +uqrshrn2 v0.16b, v0.8h, #3 +uqrshrn2 v0.4s, v0.2d, #3 +uqrshrn2 v0.8h, v0.4s, #3 +uqshl b11, b20, b30 +uqshl b18, b15, #6 +uqshl d15, d12, #19 +uqshl h11, h18, #7 +uqshl s14, s19, #18 +uqshl s23, s20, s16 +uqshl v0.16b, v0.16b, #3 +uqshl v0.16b, v0.16b, v0.16b +uqshl v0.2d, v0.2d, #3 +uqshl v0.2d, v0.2d, v0.2d +uqshl v0.2s, v0.2s, #3 +uqshl v0.4h, v0.4h, #3 +uqshl v0.4s, v0.4s, #3 +uqshl v0.4s, v0.4s, v0.4s +uqshl v0.8b, v0.8b, #3 +uqshl v0.8h, v0.8h, #3 +uqshl v0.8h, v0.8h, v0.8h +uqshrn b12, h10, #7 +uqshrn h10, s14, #5 +uqshrn s10, d12, #13 +uqshrn v0.2s, v0.2d, #3 +uqshrn v0.4h, v0.4s, #3 +uqshrn v0.8b, v0.8h, #3 +uqshrn2 v0.16b, v0.8h, #3 +uqshrn2 v0.4s, v0.2d, #3 +uqshrn2 v0.8h, v0.4s, #3 +uqsub d16, d16, d16 +uqsub v0.4h, v0.4h, v0.4h +uqxtn b18, h18 +uqxtn h20, s17 +uqxtn s19, d14 +uqxtn v0.2s, v0.2d +uqxtn v0.4h, v0.4s +uqxtn v0.8b, v0.8h +uqxtn2 v0.16b, v0.8h +uqxtn2 v0.4s, v0.2d +uqxtn2 v0.8h, v0.4s +urecpe v0.2s, v0.2s +urecpe v0.4s, v0.4s +urhadd v0.16b, v0.16b, v0.16b +urhadd v0.4s, v0.4s, v0.4s +urhadd v0.8h, v0.8h, v0.8h +urshl d8, d7, d4 +urshl v0.16b, v0.16b, v0.16b +urshl v0.2d, v0.2d, v0.2d +urshl v0.4s, v0.4s, v0.4s +urshl v0.8h, v0.8h, v0.8h +urshr d20, d23, #31 +urshr v0.16b, v0.16b, #3 +urshr v0.2d, v0.2d, #3 +urshr v0.2s, v0.2s, #3 +urshr v0.4h, v0.4h, #3 +urshr v0.4s, v0.4s, #3 +urshr v0.8b, v0.8b, #3 +urshr v0.8h, v0.8h, #3 +ursqrte v0.2s, v0.2s +ursqrte v0.4s, v0.4s +ursra d18, d10, #13 +ursra v0.16b, v0.16b, #3 +ursra v0.2d, v0.2d, #3 +ursra v0.2s, v0.2s, #3 +ursra v0.4h, v0.4h, #3 +ursra v0.4s, v0.4s, #3 +ursra v0.8b, v0.8b, #3 +ursra v0.8h, v0.8h, #3 +ushl d0, d0, d0 +ushl v0.16b, v0.16b, v0.16b +ushl v0.4s, v0.4s, v0.4s +ushl v0.8h, v0.8h, v0.8h +ushll v0.4s, v0.4h, #3 +ushll2 v0.8h, v0.16b, #3 +ushr d10, d17, #18 +ushr v0.16b, v0.16b, #3 +ushr v0.2d, v0.2d, #3 +ushr v0.2s, v0.2s, #3 +ushr v0.4h, v0.4h, #3 +ushr v0.4s, 
v0.4s, #3 +ushr v0.8b, v0.8b, #3 +ushr v0.8h, v0.8h, #3 +usqadd b19, b14 +usqadd d18, d22 +usqadd h20, h15 +usqadd s21, s12 +usqadd v0.16b, v0.16b +usqadd v0.2d, v0.2d +usqadd v0.2s, v0.2s +usqadd v0.4h, v0.4h +usqadd v0.4s, v0.4s +usqadd v0.8b, v0.8b +usqadd v0.8h, v0.8h +usra d20, d13, #61 +usra v0.16b, v0.16b, #3 +usra v0.2d, v0.2d, #3 +usra v0.2s, v0.2s, #3 +usra v0.4h, v0.4h, #3 +usra v0.4s, v0.4s, #3 +usra v0.8b, v0.8b, #3 +usra v0.8h, v0.8h, #3 +usubl v0.2d, v0.2s, v0.2s +usubl v0.4s, v0.4h, v0.4h +usubl v0.8h, v0.8b, v0.8b +usubl2 v0.2d, v0.4s, v0.4s +usubl2 v0.4s, v0.8h, v0.8h +usubl2 v0.8h, v0.16b, v0.16b +usubw v0.2d, v0.2d, v0.2s +usubw v0.4s, v0.4s, v0.4h +usubw v0.8h, v0.8h, v0.8b +usubw2 v0.2d, v0.2d, v0.4s +usubw2 v0.4s, v0.4s, v0.8h +usubw2 v0.8h, v0.8h, v0.16b +uzp1 v0.16b, v0.16b, v0.16b +uzp1 v0.2d, v0.2d, v0.2d +uzp1 v0.2s, v0.2s, v0.2s +uzp1 v0.4h, v0.4h, v0.4h +uzp1 v0.4s, v0.4s, v0.4s +uzp1 v0.8b, v0.8b, v0.8b +uzp1 v0.8h, v0.8h, v0.8h +uzp2 v0.16b, v0.16b, v0.16b +uzp2 v0.2d, v0.2d, v0.2d +uzp2 v0.2s, v0.2s, v0.2s +uzp2 v0.4h, v0.4h, v0.4h +uzp2 v0.4s, v0.4s, v0.4s +uzp2 v0.8b, v0.8b, v0.8b +uzp2 v0.8h, v0.8h, v0.8h +xtn v0.2s, v0.2d +xtn v0.4h, v0.4s +xtn v0.8b, v0.8h +xtn2 v0.16b, v0.8h +xtn2 v0.4s, v0.2d +xtn2 v0.8h, v0.4s +zip1 v0.16b, v0.16b, v0.16b +zip1 v0.2d, v0.2d, v0.2d +zip1 v0.2s, v0.2s, v0.2s +zip1 v0.4h, v0.4h, v0.4h +zip1 v0.4s, v0.4s, v0.4s +zip1 v0.8b, v0.8b, v0.8b +zip1 v0.8h, v0.8h, v0.8h +zip2 v0.16b, v0.16b, v0.16b +zip2 v0.2d, v0.2d, v0.2d +zip2 v0.2s, v0.2s, v0.2s +zip2 v0.4h, v0.4h, v0.4h +zip2 v0.4s, v0.4s, v0.4s +zip2 v0.8b, v0.8b, v0.8b +zip2 v0.8h, v0.8h, v0.8h + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 2 0.50 abs d29, d24 +# CHECK-NEXT: 1 2 0.50 abs v0.16b, v0.16b +# CHECK-NEXT: 1 
2 0.50 abs v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 abs v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 abs v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 abs v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 abs v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 abs v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 add d17, d31, d29 +# CHECK-NEXT: 1 2 0.50 add v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 2 6 1.00 addhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: 2 6 1.00 addhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: 2 6 1.00 addhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: 2 6 1.00 addhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: 2 6 1.00 addhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: 2 6 1.00 addhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 addp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 addp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 and v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 bic v0.4h, #15, lsl #8 +# CHECK-NEXT: 1 2 0.50 bic v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 bif v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 bit v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 bsl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 cls v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 cls v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 cls v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 cls v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 cls v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 cls v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 clz v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 clz v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 clz v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 clz v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 clz v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 clz v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 cmeq d20, d21, #0 +# CHECK-NEXT: 1 2 0.50 cmeq d20, d21, d22 +# CHECK-NEXT: 1 2 0.50 cmeq v0.16b, v0.16b, #0 +# CHECK-NEXT: 1 2 0.50 cmeq v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 cmge d20, d21, #0 +# CHECK-NEXT: 1 2 0.50 cmge d20, d21, d22 +# CHECK-NEXT: 1 2 0.50 cmge v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 cmge v0.8b, v0.8b, #0 +# CHECK-NEXT: 1 2 0.50 cmgt d20, d21, #0 +# CHECK-NEXT: 1 2 0.50 cmgt d20, d21, d22 +# CHECK-NEXT: 1 2 0.50 cmgt v0.2s, 
v0.2s, #0 +# CHECK-NEXT: 1 2 0.50 cmgt v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 cmhi d20, d21, d22 +# CHECK-NEXT: 1 2 0.50 cmhi v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 cmhs d20, d21, d22 +# CHECK-NEXT: 1 2 0.50 cmhs v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 cmle d20, d21, #0 +# CHECK-NEXT: 1 2 0.50 cmle v0.2d, v0.2d, #0 +# CHECK-NEXT: 1 2 0.50 cmlt d20, d21, #0 +# CHECK-NEXT: 1 2 0.50 cmlt v0.8h, v0.8h, #0 +# CHECK-NEXT: 1 2 0.50 cmtst d20, d21, d22 +# CHECK-NEXT: 1 2 0.50 cmtst v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 cnt v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 cnt v0.8b, v0.8b +# CHECK-NEXT: 1 5 1.00 dup v0.16b, w28 +# CHECK-NEXT: 1 5 1.00 dup v0.2d, x28 +# CHECK-NEXT: 1 5 1.00 dup v0.2s, w28 +# CHECK-NEXT: 1 5 1.00 dup v0.4h, w28 +# CHECK-NEXT: 1 5 1.00 dup v0.4s, w28 +# CHECK-NEXT: 1 5 1.00 dup v0.8b, w28 +# CHECK-NEXT: 1 5 1.00 dup v0.8h, w28 +# CHECK-NEXT: 1 2 0.50 eor v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 ext v0.16b, v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 2 0.50 ext v0.8b, v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 fabd d29, d24, d20 +# CHECK-NEXT: 1 3 0.50 fabd s29, s24, s20 +# CHECK-NEXT: 1 3 0.50 fabd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fabs v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fabs v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fabs v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fabs v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fabs v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 facge d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 facge s10, s11, s12 +# CHECK-NEXT: 1 3 0.50 facge v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 facgt d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 facgt s10, s11, s12 +# CHECK-NEXT: 1 3 0.50 facgt v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fadd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 faddp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 faddp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcmeq d20, d21, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmeq d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 fcmeq s10, s11, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmeq s10, s11, s12 +# CHECK-NEXT: 1 3 
0.50 fcmeq v0.2s, v0.2s, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmeq v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcmge d20, d21, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmge d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 fcmge s10, s11, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmge s10, s11, s12 +# CHECK-NEXT: 1 3 0.50 fcmge v0.2d, v0.2d, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmge v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcmgt d20, d21, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmgt d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 fcmgt s10, s11, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmgt s10, s11, s12 +# CHECK-NEXT: 1 3 0.50 fcmgt v0.4s, v0.4s, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmgt v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcmle d20, d21, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmle s10, s11, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmle v0.2d, v0.2d, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmlt d20, d21, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmlt s10, s11, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmlt v0.4s, v0.4s, #0.0 +# CHECK-NEXT: 1 3 0.50 fcvtas d21, d14 +# CHECK-NEXT: 1 3 0.50 fcvtas s12, s13 +# CHECK-NEXT: 1 3 0.50 fcvtas v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtas v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtas v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtas v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtas v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 fcvtau d21, d14 +# CHECK-NEXT: 1 3 0.50 fcvtau s12, s13 +# CHECK-NEXT: 1 3 0.50 fcvtau v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtau v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtau v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtau v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtau v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 fcvtl v0.2d, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtl v0.4s, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtl2 v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtl2 v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 fcvtms d21, d14 +# CHECK-NEXT: 1 3 0.50 fcvtms s22, s13 +# CHECK-NEXT: 1 3 0.50 fcvtms v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtms v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtms v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtms v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtms v0.8h, v0.8h +# 
CHECK-NEXT: 1 3 0.50 fcvtmu d21, d14 +# CHECK-NEXT: 1 3 0.50 fcvtmu s12, s13 +# CHECK-NEXT: 1 3 0.50 fcvtmu v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtmu v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtmu v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtmu v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtmu v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 fcvtn v0.2s, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtn v0.4h, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtn2 v0.4s, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtn2 v0.8h, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtns d21, d14 +# CHECK-NEXT: 1 3 0.50 fcvtns s22, s13 +# CHECK-NEXT: 1 3 0.50 fcvtns v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtns v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtns v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtns v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtns v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 fcvtnu d21, d14 +# CHECK-NEXT: 1 3 0.50 fcvtnu s12, s13 +# CHECK-NEXT: 1 3 0.50 fcvtnu v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtnu v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtnu v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtnu v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtnu v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 fcvtps d21, d14 +# CHECK-NEXT: 1 3 0.50 fcvtps s22, s13 +# CHECK-NEXT: 1 3 0.50 fcvtps v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtps v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtps v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtps v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtps v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 fcvtpu d21, d14 +# CHECK-NEXT: 1 3 0.50 fcvtpu s12, s13 +# CHECK-NEXT: 1 3 0.50 fcvtpu v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtpu v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtpu v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtpu v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtpu v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 fcvtxn s22, d13 +# CHECK-NEXT: 1 3 0.50 fcvtxn v0.2s, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtxn2 v0.4s, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtzs d21, d12, #1 +# CHECK-NEXT: 1 3 0.50 fcvtzs d21, d14 +# CHECK-NEXT: 1 3 0.50 fcvtzs s12, s13 +# CHECK-NEXT: 1 3 0.50 fcvtzs s21, s12, #1 +# CHECK-NEXT: 1 3 0.50 fcvtzs 
v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtzs v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 fcvtzs v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtzs v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 fcvtzs v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtzs v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtzs v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 fcvtzs v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 fcvtzu d21, d12, #1 +# CHECK-NEXT: 1 3 0.50 fcvtzu d21, d14 +# CHECK-NEXT: 1 3 0.50 fcvtzu s12, s13 +# CHECK-NEXT: 1 3 0.50 fcvtzu s21, s12, #1 +# CHECK-NEXT: 1 3 0.50 fcvtzu v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtzu v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 fcvtzu v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtzu v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 fcvtzu v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtzu v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtzu v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 fcvtzu v0.8h, v0.8h +# CHECK-NEXT: 1 12 1.00 fdiv v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fmax v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fmax v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fmax v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fmaxnm v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fmaxnm v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fmaxnm v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fmaxnmp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fmaxnmp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fmaxnmp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fmaxp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fmaxp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fmaxp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fmin v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fmin v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fmin v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fminnm v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fminnm v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fminnm v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fminnmp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fminnmp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fminnmp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 
0.50 fminp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fminp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fminp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fmla d0, d1, v0.d[1] +# CHECK-NEXT: 1 4 0.50 fmla s0, s1, v0.s[3] +# CHECK-NEXT: 1 4 0.50 fmla v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fmls d0, d4, v0.d[1] +# CHECK-NEXT: 1 4 0.50 fmls s3, s5, v0.s[3] +# CHECK-NEXT: 1 4 0.50 fmls v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 fmov v0.2d, #-1.25000000 +# CHECK-NEXT: 1 2 0.50 fmov v0.2s, #13.00000000 +# CHECK-NEXT: 1 2 0.50 fmov v0.4s, #1.00000000 +# CHECK-NEXT: 1 4 0.50 fmul d0, d1, v0.d[1] +# CHECK-NEXT: 1 4 0.50 fmul s0, s1, v0.s[3] +# CHECK-NEXT: 1 4 0.50 fmul v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fmulx d0, d4, v0.d[1] +# CHECK-NEXT: 1 4 0.50 fmulx d23, d11, d1 +# CHECK-NEXT: 1 4 0.50 fmulx s20, s22, s15 +# CHECK-NEXT: 1 4 0.50 fmulx s3, s5, v0.s[3] +# CHECK-NEXT: 1 4 0.50 fmulx v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fmulx v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fmulx v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fneg v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fneg v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fneg v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fneg v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fneg v0.8h, v0.8h +# CHECK-NEXT: 2 6 1.00 frecpe d13, d13 +# CHECK-NEXT: 2 6 1.00 frecpe s19, s14 +# CHECK-NEXT: 2 6 1.00 frecpe v0.2d, v0.2d +# CHECK-NEXT: 2 6 1.00 frecpe v0.2s, v0.2s +# CHECK-NEXT: 2 6 1.00 frecpe v0.4h, v0.4h +# CHECK-NEXT: 2 6 1.00 frecpe v0.4s, v0.4s +# CHECK-NEXT: 2 6 1.00 frecpe v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 frecps v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 frecps d22, d30, d21 +# CHECK-NEXT: 1 3 0.50 frecps s21, s16, s13 +# CHECK-NEXT: 1 3 0.50 frecpx d16, d19 +# CHECK-NEXT: 1 3 0.50 frecpx s18, s10 +# CHECK-NEXT: 1 3 0.50 frinta v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 frinta v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 frinta v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 frinta v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 frinta v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 
frinti v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 frinti v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 frinti v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 frinti v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 frinti v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 frintm v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 frintm v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 frintm v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 frintm v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 frintm v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 frintn v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 frintn v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 frintn v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 frintn v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 frintn v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 frintp v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 frintp v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 frintp v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 frintp v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 frintp v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 frintx v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 frintx v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 frintx v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 frintx v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 frintx v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 frintz v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 frintz v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 frintz v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 frintz v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 frintz v0.8h, v0.8h +# CHECK-NEXT: 2 6 1.00 frsqrte d21, d12 +# CHECK-NEXT: 2 6 1.00 frsqrte s22, s13 +# CHECK-NEXT: 2 6 1.00 frsqrte v0.2d, v0.2d +# CHECK-NEXT: 2 6 1.00 frsqrte v0.2s, v0.2s +# CHECK-NEXT: 2 6 1.00 frsqrte v0.4h, v0.4h +# CHECK-NEXT: 2 6 1.00 frsqrte v0.4s, v0.4s +# CHECK-NEXT: 2 6 1.00 frsqrte v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 frsqrts d8, d22, d18 +# CHECK-NEXT: 1 3 0.50 frsqrts s21, s5, s12 +# CHECK-NEXT: 1 3 0.50 frsqrts v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 63 1.00 fsqrt v0.2d, v0.2d +# CHECK-NEXT: 1 33 1.00 fsqrt v0.2s, v0.2s +# CHECK-NEXT: 1 39 1.00 fsqrt v0.4h, v0.4h +# CHECK-NEXT: 1 33 1.00 fsqrt v0.4s, v0.4s +# CHECK-NEXT: 1 39 1.00 fsqrt v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 fsub 
v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 * ld1 { v0.16b }, [x0] +# CHECK-NEXT: 3 5 1.50 * ld1 { v0.2d, v1.2d, v2.2d }, [x0], #48 +# CHECK-NEXT: 4 5 2.00 * ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +# CHECK-NEXT: 2 4 1.00 * ld1 { v0.4s, v1.4s }, [sp], #32 +# CHECK-NEXT: 3 5 1.50 * ld1 { v0.4s, v1.4s, v2.4s }, [sp] +# CHECK-NEXT: 4 5 2.00 * ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3 +# CHECK-NEXT: 1 4 0.50 * ld1 { v0.8h }, [x15], x2 +# CHECK-NEXT: 2 4 1.00 * ld1 { v0.8h, v1.8h }, [x15] +# CHECK-NEXT: 2 6 0.50 * ld1 { v0.b }[9], [x0] +# CHECK-NEXT: 2 6 0.50 * ld1 { v0.b }[9], [x0], #1 +# CHECK-NEXT: 2 6 0.50 * ld1r { v0.16b }, [x0] +# CHECK-NEXT: 2 6 0.50 * ld1r { v0.16b }, [x0], #1 +# CHECK-NEXT: 2 6 0.50 * ld1r { v0.8h }, [x15] +# CHECK-NEXT: 2 6 0.50 * ld1r { v0.8h }, [x15], #2 +# CHECK-NEXT: 4 6 1.00 * ld2 { v0.16b, v1.16b }, [x0], x1 +# CHECK-NEXT: 5 8 1.50 * ld2 { v0.8b, v1.8b }, [x0] +# CHECK-NEXT: 4 6 1.00 * ld2 { v0.h, v1.h }[7], [x15] +# CHECK-NEXT: 4 6 1.00 * ld2 { v0.h, v1.h }[7], [x15], #4 +# CHECK-NEXT: 4 6 1.00 * ld2r { v0.2d, v1.2d }, [x0] +# CHECK-NEXT: 4 6 1.00 * ld2r { v0.2d, v1.2d }, [x0], #16 +# CHECK-NEXT: 4 6 1.00 * ld2r { v0.4s, v1.4s }, [sp] +# CHECK-NEXT: 4 6 1.00 * ld2r { v0.4s, v1.4s }, [sp], #8 +# CHECK-NEXT: 6 9 1.50 * ld3 { v0.4h, v1.4h, v2.4h }, [x15] +# CHECK-NEXT: 6 8 1.50 * ld3 { v0.8h, v1.8h, v2.8h }, [x15], x2 +# CHECK-NEXT: 6 7 1.50 * ld3 { v0.s, v1.s, v2.s }[3], [sp] +# CHECK-NEXT: 6 7 1.50 * ld3 { v0.s, v1.s, v2.s }[3], [sp], x3 +# CHECK-NEXT: 6 7 1.50 * ld3r { v0.4h, v1.4h, v2.4h }, [x15] +# CHECK-NEXT: 6 7 1.50 * ld3r { v0.4h, v1.4h, v2.4h }, [x15], #6 +# CHECK-NEXT: 6 7 1.50 * ld3r { v0.8b, v1.8b, v2.8b }, [x0] +# CHECK-NEXT: 6 7 1.50 * ld3r { v0.8b, v1.8b, v2.8b }, [x0], #3 +# CHECK-NEXT: 12 11 2.00 * ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# CHECK-NEXT: 12 10 2.00 * ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64 +# CHECK-NEXT: 8 7 2.00 * ld4 { v0.d, v1.d, v2.d, v3.d }[1], [x0] +# CHECK-NEXT: 8 7 2.00 * ld4 { v0.d, 
v1.d, v2.d, v3.d }[1], [x0], #32 +# CHECK-NEXT: 8 7 2.00 * ld4 { v0.h, v1.h, v2.h, v3.h }[7], [x0], x0 +# CHECK-NEXT: 4 5 2.00 * ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [sp] +# CHECK-NEXT: 4 5 2.00 * ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [sp], x7 +# CHECK-NEXT: 8 7 2.00 * ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# CHECK-NEXT: 8 7 2.00 * ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x30 +# CHECK-NEXT: 1 3 0.50 mla v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 mls v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 mov b0, v0.b[15] +# CHECK-NEXT: 1 3 0.50 mov d6, v0.d[1] +# CHECK-NEXT: 1 3 0.50 mov h2, v0.h[5] +# CHECK-NEXT: 1 3 0.50 mov s17, v0.s[2] +# CHECK-NEXT: 1 2 0.50 mov v2.b[0], v0.b[0] +# CHECK-NEXT: 1 2 0.50 mov v2.h[1], v0.h[1] +# CHECK-NEXT: 1 2 0.50 mov v2.s[2], v0.s[2] +# CHECK-NEXT: 1 2 0.50 mov v2.d[1], v0.d[1] +# CHECK-NEXT: 2 7 1.00 mov v0.b[0], w8 +# CHECK-NEXT: 2 7 1.00 mov v0.h[1], w8 +# CHECK-NEXT: 2 7 1.00 mov v0.s[2], w8 +# CHECK-NEXT: 2 7 1.00 mov v0.d[1], x8 +# CHECK-NEXT: 1 2 0.50 mov v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 mov v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 movi d15, #0xff00ff00ff00ff +# CHECK-NEXT: 1 2 0.50 movi v0.16b, #31 +# CHECK-NEXT: 1 2 0.50 movi v0.2d, #0xff0000ff0000ffff +# CHECK-NEXT: 1 2 0.50 movi v0.2s, #8, msl #8 +# CHECK-NEXT: 1 2 0.50 movi v0.4s, #255, lsl #24 +# CHECK-NEXT: 1 2 0.50 movi v0.8b, #255 +# CHECK-NEXT: 1 3 0.50 mul v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 mvni v0.2s, #0 +# CHECK-NEXT: 1 2 0.50 mvni v0.4s, #16, msl #16 +# CHECK-NEXT: 1 3 0.50 neg d29, d24 +# CHECK-NEXT: 1 3 0.50 neg v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 neg v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 neg v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 neg v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 neg v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 neg v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 neg v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 mvn v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 mvn v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 orn v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 mov v0.16b, v0.16b +# 
CHECK-NEXT: 1 2 0.50 orr v0.8h, #31 +# CHECK-NEXT: 1 2 0.50 pmul v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 pmul v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 pmull v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 pmull2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 2 6 1.00 raddhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: 2 6 1.00 raddhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: 2 6 1.00 raddhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: 2 6 1.00 raddhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: 2 6 1.00 raddhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: 2 6 1.00 raddhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 rbit v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 rbit v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 rev16 v21.8b, v1.8b +# CHECK-NEXT: 1 2 0.50 rev16 v30.16b, v31.16b +# CHECK-NEXT: 1 2 0.50 rev32 v0.4h, v9.4h +# CHECK-NEXT: 1 2 0.50 rev32 v21.8b, v1.8b +# CHECK-NEXT: 1 2 0.50 rev32 v30.16b, v31.16b +# CHECK-NEXT: 1 2 0.50 rev32 v4.8h, v7.8h +# CHECK-NEXT: 1 2 0.50 rev64 v0.16b, v31.16b +# CHECK-NEXT: 1 2 0.50 rev64 v1.8b, v9.8b +# CHECK-NEXT: 1 2 0.50 rev64 v13.4h, v21.4h +# CHECK-NEXT: 1 2 0.50 rev64 v2.8h, v4.8h +# CHECK-NEXT: 1 2 0.50 rev64 v4.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 rev64 v6.4s, v8.4s +# CHECK-NEXT: 2 6 1.00 rshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: 2 6 1.00 rshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: 2 6 1.00 rshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: 2 6 1.00 rshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 2 6 1.00 rshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 2 6 1.00 rshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 2 6 1.00 rsubhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: 2 6 1.00 rsubhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: 2 6 1.00 rsubhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: 2 6 1.00 rsubhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: 2 6 1.00 rsubhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: 2 6 1.00 rsubhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 saba v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 sabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 sabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 sabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 sabal2 
v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 sabal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 sabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 sabd v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 sabdl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 sabdl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 sabdl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 sabdl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 sabdl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 sabdl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 sadalp v0.1d, v0.2s +# CHECK-NEXT: 1 2 0.50 sadalp v0.2d, v0.4s +# CHECK-NEXT: 1 2 0.50 sadalp v0.2s, v0.4h +# CHECK-NEXT: 1 2 0.50 sadalp v0.4h, v0.8b +# CHECK-NEXT: 1 2 0.50 sadalp v0.4s, v0.8h +# CHECK-NEXT: 1 2 0.50 sadalp v0.8h, v0.16b +# CHECK-NEXT: 1 2 0.50 saddl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 saddl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 saddl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 saddl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 saddl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 saddl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 saddlp v0.1d, v0.2s +# CHECK-NEXT: 1 2 0.50 saddlp v0.2d, v0.4s +# CHECK-NEXT: 1 2 0.50 saddlp v0.2s, v0.4h +# CHECK-NEXT: 1 2 0.50 saddlp v0.4h, v0.8b +# CHECK-NEXT: 1 2 0.50 saddlp v0.4s, v0.8h +# CHECK-NEXT: 1 2 0.50 saddlp v0.8h, v0.16b +# CHECK-NEXT: 1 2 0.50 saddw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 2 0.50 saddw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 2 0.50 saddw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 2 0.50 saddw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 2 0.50 saddw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 2 0.50 saddw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 scvtf d21, d12 +# CHECK-NEXT: 1 3 0.50 scvtf d21, d12, #64 +# CHECK-NEXT: 1 3 0.50 scvtf s22, s13 +# CHECK-NEXT: 1 3 0.50 scvtf s22, s13, #32 +# CHECK-NEXT: 1 3 0.50 scvtf v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 scvtf v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 scvtf v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 scvtf v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 scvtf v0.4h, 
v0.4h +# CHECK-NEXT: 1 3 0.50 scvtf v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 scvtf v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 scvtf v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 shadd v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 shl d7, d10, #12 +# CHECK-NEXT: 1 3 0.50 shl v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 shl v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 shl v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 shl v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 shll v0.2d, v0.2s, #32 +# CHECK-NEXT: 1 3 0.50 shll v0.4s, v0.4h, #16 +# CHECK-NEXT: 1 3 0.50 shll v0.8h, v0.8b, #8 +# CHECK-NEXT: 1 3 0.50 shll v0.2d, v0.2s, #32 +# CHECK-NEXT: 1 3 0.50 shll v0.4s, v0.4h, #16 +# CHECK-NEXT: 1 3 0.50 shll v0.8h, v0.8b, #8 +# CHECK-NEXT: 1 3 0.50 shll2 v0.2d, v0.4s, #32 +# CHECK-NEXT: 1 3 0.50 shll2 v0.4s, v0.8h, #16 +# CHECK-NEXT: 1 3 0.50 shll2 v0.8h, v0.16b, #8 +# CHECK-NEXT: 1 3 0.50 shll2 v0.2d, v0.4s, #32 +# CHECK-NEXT: 1 3 0.50 shll2 v0.4s, v0.8h, #16 +# CHECK-NEXT: 1 3 0.50 shll2 v0.8h, v0.16b, #8 +# CHECK-NEXT: 2 6 1.00 shrn v0.2s, v0.2d, #3 +# CHECK-NEXT: 2 6 1.00 shrn v0.4h, v0.4s, #3 +# CHECK-NEXT: 2 6 1.00 shrn v0.8b, v0.8h, #3 +# CHECK-NEXT: 2 6 1.00 shrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 2 6 1.00 shrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 2 6 1.00 shrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 shsub v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 shsub v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 sli d10, d14, #12 +# CHECK-NEXT: 1 3 0.50 sli v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 sli v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 sli v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 sli v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 sli v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 sli v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 sli v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 smax v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 smax v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 smax v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 smaxp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 smaxp v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 smaxp 
v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 smin v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 smin v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 smin v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 sminp v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 sminp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 sminp v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 smlal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 smlal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 smlal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 smlal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 smlal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 smlal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 smlsl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 smlsl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 smlsl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 smlsl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 smlsl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 smlsl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 smull v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 smull v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 smull v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 smull2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 smull2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 smull2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 sqabs b19, b14 +# CHECK-NEXT: 1 2 0.50 sqabs d18, d12 +# CHECK-NEXT: 1 2 0.50 sqabs h21, h15 +# CHECK-NEXT: 1 2 0.50 sqabs s20, s12 +# CHECK-NEXT: 1 2 0.50 sqabs v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 sqabs v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 sqabs v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 sqabs v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 sqabs v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 sqabs v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 sqabs v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 sqadd b20, b11, b15 +# CHECK-NEXT: 1 2 0.50 sqadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 sqadd v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 sqdmlal d19, s24, s12 +# CHECK-NEXT: 1 3 0.50 sqdmlal d8, s9, v0.s[1] +# CHECK-NEXT: 1 3 0.50 sqdmlal s0, h0, v0.h[3] +# 
CHECK-NEXT: 1 3 0.50 sqdmlal s17, h27, h12 +# CHECK-NEXT: 1 3 0.50 sqdmlal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 sqdmlal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 sqdmlal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 sqdmlal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 sqdmlsl d12, s23, s13 +# CHECK-NEXT: 1 3 0.50 sqdmlsl d8, s9, v0.s[1] +# CHECK-NEXT: 1 3 0.50 sqdmlsl s0, h0, v0.h[3] +# CHECK-NEXT: 1 3 0.50 sqdmlsl s14, h12, h25 +# CHECK-NEXT: 1 3 0.50 sqdmlsl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 sqdmlsl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 sqdmlsl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 sqdmlsl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 sqdmulh h10, h11, h12 +# CHECK-NEXT: 1 3 0.50 sqdmulh h7, h15, v0.h[3] +# CHECK-NEXT: 1 3 0.50 sqdmulh s15, s14, v0.s[1] +# CHECK-NEXT: 1 3 0.50 sqdmulh s20, s21, s2 +# CHECK-NEXT: 1 3 0.50 sqdmulh v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 sqdmulh v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 sqdmull d1, s1, v0.s[1] +# CHECK-NEXT: 1 3 0.50 sqdmull d15, s22, s12 +# CHECK-NEXT: 1 3 0.50 sqdmull s1, h1, v0.h[3] +# CHECK-NEXT: 1 3 0.50 sqdmull s12, h22, h12 +# CHECK-NEXT: 1 3 0.50 sqdmull v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 sqdmull v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 sqdmull2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 sqdmull2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 sqneg b19, b14 +# CHECK-NEXT: 1 2 0.50 sqneg d18, d12 +# CHECK-NEXT: 1 2 0.50 sqneg h21, h15 +# CHECK-NEXT: 1 2 0.50 sqneg s20, s12 +# CHECK-NEXT: 1 2 0.50 sqneg v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 sqneg v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 sqneg v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 sqneg v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 sqneg v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 sqneg v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 sqneg v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 sqrdmulh h10, h11, h12 +# CHECK-NEXT: 1 3 0.50 sqrdmulh h7, h15, v0.h[3] +# CHECK-NEXT: 1 3 0.50 sqrdmulh s15, s14, v0.s[1] +# CHECK-NEXT: 1 3 0.50 sqrdmulh s20, s21, s2 +# 
CHECK-NEXT: 1 3 0.50 sqrdmulh v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 sqrdmulh v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 sqrshl d31, d31, d31 +# CHECK-NEXT: 1 2 0.50 sqrshl h3, h4, h15 +# CHECK-NEXT: 1 2 0.50 sqrshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 sqrshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 sqrshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 sqrshrn b10, h13, #2 +# CHECK-NEXT: 1 3 0.50 sqrshrn h15, s10, #6 +# CHECK-NEXT: 1 3 0.50 sqrshrn s15, d12, #9 +# CHECK-NEXT: 1 2 0.50 sqrshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 sqrshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 sqrshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 sqrshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 sqrshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 sqrshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 sqrshrun b17, h10, #6 +# CHECK-NEXT: 1 3 0.50 sqrshrun h10, s13, #15 +# CHECK-NEXT: 1 3 0.50 sqrshrun s22, d16, #31 +# CHECK-NEXT: 1 2 0.50 sqrshrun v0.2s, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 sqrshrun v0.4h, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 sqrshrun v0.8b, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 sqrshrun2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 sqrshrun2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 sqrshrun2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 sqshl b11, b19, #7 +# CHECK-NEXT: 1 2 0.50 sqshl d15, d16, #51 +# CHECK-NEXT: 1 2 0.50 sqshl d31, d31, d31 +# CHECK-NEXT: 1 2 0.50 sqshl h13, h18, #11 +# CHECK-NEXT: 1 2 0.50 sqshl h3, h4, h15 +# CHECK-NEXT: 1 2 0.50 sqshl s14, s17, #22 +# CHECK-NEXT: 1 2 0.50 sqshl v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 2 0.50 sqshl v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 sqshl v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 2 0.50 sqshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 sqshl v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 2 0.50 sqshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 sqshl v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 sqshl v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 2 0.50 sqshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 sqshl v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 sqshlu b15, 
b18, #6 +# CHECK-NEXT: 1 2 0.50 sqshlu d11, d13, #32 +# CHECK-NEXT: 1 2 0.50 sqshlu h19, h17, #6 +# CHECK-NEXT: 1 2 0.50 sqshlu s16, s14, #25 +# CHECK-NEXT: 1 2 0.50 sqshlu v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 2 0.50 sqshlu v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 sqshlu v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 2 0.50 sqshlu v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 2 0.50 sqshlu v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 sqshlu v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 2 0.50 sqshlu v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 sqshrn b10, h15, #5 +# CHECK-NEXT: 1 3 0.50 sqshrn h17, s10, #4 +# CHECK-NEXT: 1 3 0.50 sqshrn s18, d10, #31 +# CHECK-NEXT: 2 6 1.00 sqshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: 2 6 1.00 sqshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: 2 6 1.00 sqshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: 2 6 1.00 sqshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 2 6 1.00 sqshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 2 6 1.00 sqshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 sqshrun b15, h10, #7 +# CHECK-NEXT: 1 3 0.50 sqshrun h20, s14, #3 +# CHECK-NEXT: 1 3 0.50 sqshrun s10, d15, #15 +# CHECK-NEXT: 2 6 1.00 sqshrun v0.2s, v0.2d, #3 +# CHECK-NEXT: 2 6 1.00 sqshrun v0.4h, v0.4s, #3 +# CHECK-NEXT: 2 6 1.00 sqshrun v0.8b, v0.8h, #3 +# CHECK-NEXT: 2 6 1.00 sqshrun2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 2 6 1.00 sqshrun2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 2 6 1.00 sqshrun2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 sqsub s20, s10, s7 +# CHECK-NEXT: 1 2 0.50 sqsub v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 sqsub v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 sqsub v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 sqxtn b18, h18 +# CHECK-NEXT: 1 2 0.50 sqxtn h20, s17 +# CHECK-NEXT: 1 2 0.50 sqxtn s19, d14 +# CHECK-NEXT: 1 2 0.50 sqxtn v0.2s, v0.2d +# CHECK-NEXT: 1 2 0.50 sqxtn v0.4h, v0.4s +# CHECK-NEXT: 1 2 0.50 sqxtn v0.8b, v0.8h +# CHECK-NEXT: 1 2 0.50 sqxtn2 v0.16b, v0.8h +# CHECK-NEXT: 1 2 0.50 sqxtn2 v0.4s, v0.2d +# CHECK-NEXT: 1 2 0.50 sqxtn2 v0.8h, v0.4s +# CHECK-NEXT: 1 2 0.50 sqxtun b19, h14 +# CHECK-NEXT: 1 2 0.50 sqxtun h21, s15 +# 
CHECK-NEXT: 1 2 0.50 sqxtun s20, d12 +# CHECK-NEXT: 1 2 0.50 sqxtun v0.2s, v0.2d +# CHECK-NEXT: 1 2 0.50 sqxtun v0.4h, v0.4s +# CHECK-NEXT: 1 2 0.50 sqxtun v0.8b, v0.8h +# CHECK-NEXT: 1 2 0.50 sqxtun2 v0.16b, v0.8h +# CHECK-NEXT: 1 2 0.50 sqxtun2 v0.4s, v0.2d +# CHECK-NEXT: 1 2 0.50 sqxtun2 v0.8h, v0.4s +# CHECK-NEXT: 1 2 0.50 srhadd v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 srhadd v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 srhadd v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 sri d10, d12, #14 +# CHECK-NEXT: 1 3 0.50 sri v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 sri v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 sri v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 sri v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 sri v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 sri v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 sri v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 srshl d16, d16, d16 +# CHECK-NEXT: 1 3 0.50 srshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 srshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 srshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 srshr d19, d18, #7 +# CHECK-NEXT: 1 3 0.50 srshr v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 srshr v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 srshr v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 srshr v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 srshr v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 srshr v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 srshr v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 srsra d15, d11, #19 +# CHECK-NEXT: 1 2 0.50 srsra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 2 0.50 srsra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 srsra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 2 0.50 srsra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 2 0.50 srsra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 srsra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 2 0.50 srsra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 sshl d31, d31, d31 +# CHECK-NEXT: 1 2 0.50 sshl v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 sshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 sshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 sshl v0.8b, v0.8b, v0.8b 
+# CHECK-NEXT: 1 3 0.50 sshll v0.2d, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 sshll2 v0.4s, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 sshr d15, d16, #12 +# CHECK-NEXT: 1 3 0.50 sshr v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 sshr v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 sshr v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 sshr v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 sshr v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 sshr v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 sshr v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 ssra d18, d12, #21 +# CHECK-NEXT: 1 2 0.50 ssra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 2 0.50 ssra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 ssra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 2 0.50 ssra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 2 0.50 ssra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 ssra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 2 0.50 ssra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 ssubl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 ssubl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 ssubl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 ssubl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 ssubl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 ssubl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 ssubw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 2 0.50 ssubw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 2 0.50 ssubw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 2 0.50 ssubw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 2 0.50 ssubw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 2 0.50 ssubw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 2 2 1.00 * st1 { v0.16b }, [x0] +# CHECK-NEXT: 6 4 3.00 * st1 { v0.2d, v1.2d, v2.2d }, [x0], #48 +# CHECK-NEXT: 8 5 4.00 * st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +# CHECK-NEXT: 4 3 2.00 * st1 { v0.4s, v1.4s }, [sp], #32 +# CHECK-NEXT: 6 4 3.00 * st1 { v0.4s, v1.4s, v2.4s }, [sp] +# CHECK-NEXT: 8 5 4.00 * st1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3 +# CHECK-NEXT: 2 2 1.00 * st1 { v0.8h }, [x15], x2 +# CHECK-NEXT: 4 3 2.00 * st1 { v0.8h, v1.8h }, [x15] +# CHECK-NEXT: 3 4 1.00 * st1 { v0.d }[1], [x0] +# CHECK-NEXT: 3 4 1.00 * st1 { v0.d 
}[1], [x0], #8 +# CHECK-NEXT: 6 5 2.00 * st2 { v0.16b, v1.16b }, [x0], x1 +# CHECK-NEXT: 6 6 2.00 * st2 { v0.8b, v1.8b }, [x0] +# CHECK-NEXT: 6 5 2.00 * st2 { v0.s, v1.s }[3], [sp] +# CHECK-NEXT: 6 5 2.00 * st2 { v0.s, v1.s }[3], [sp], #8 +# CHECK-NEXT: 9 6 3.00 * st3 { v0.4h, v1.4h, v2.4h }, [x15] +# CHECK-NEXT: 9 6 3.00 * st3 { v0.8h, v1.8h, v2.8h }, [x15], x2 +# CHECK-NEXT: 9 6 3.00 * st3 { v0.h, v1.h, v2.h }[7], [x15] +# CHECK-NEXT: 9 6 3.00 * st3 { v0.h, v1.h, v2.h }[7], [x15], #6 +# CHECK-NEXT: 14 9 4.00 * st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# CHECK-NEXT: 12 7 4.00 * st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64 +# CHECK-NEXT: 12 7 4.00 * st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0] +# CHECK-NEXT: 12 7 4.00 * st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0], x5 +# CHECK-NEXT: 1 2 0.50 sub d15, d5, d16 +# CHECK-NEXT: 1 2 0.50 sub v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 suqadd b19, b14 +# CHECK-NEXT: 1 2 0.50 suqadd d18, d22 +# CHECK-NEXT: 1 2 0.50 suqadd h20, h15 +# CHECK-NEXT: 1 2 0.50 suqadd s21, s12 +# CHECK-NEXT: 1 2 0.50 suqadd v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 suqadd v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 suqadd v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 suqadd v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 suqadd v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 suqadd v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 suqadd v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 tbl v0.16b, { v0.16b }, v0.16b +# CHECK-NEXT: 2 4 1.00 tbl v0.16b, { v0.16b, v1.16b }, v0.16b +# CHECK-NEXT: 3 6 1.50 tbl v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b +# CHECK-NEXT: 4 8 2.00 tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b +# CHECK-NEXT: 1 2 0.50 tbl v0.8b, { v0.16b }, v0.8b +# CHECK-NEXT: 2 4 1.00 tbl v0.8b, { v0.16b, v1.16b }, v0.8b +# CHECK-NEXT: 3 6 1.50 tbl v0.8b, { v0.16b, v1.16b, v2.16b }, v0.8b +# CHECK-NEXT: 4 8 2.00 tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b +# CHECK-NEXT: 1 2 0.50 tbx v0.16b, { v0.16b }, v0.16b +# CHECK-NEXT: 2 4 1.00 tbx v0.16b, { v0.16b, v1.16b }, v0.16b +# CHECK-NEXT: 3 6 1.50 
tbx v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b +# CHECK-NEXT: 4 8 2.00 tbx v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b +# CHECK-NEXT: 1 2 0.50 tbx v0.8b, { v0.16b }, v0.8b +# CHECK-NEXT: 2 4 1.00 tbx v0.8b, { v0.16b, v1.16b }, v0.8b +# CHECK-NEXT: 3 6 1.50 tbx v0.8b, { v0.16b, v1.16b, v2.16b }, v0.8b +# CHECK-NEXT: 4 8 2.00 tbx v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b +# CHECK-NEXT: 1 2 0.50 trn1 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 trn1 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 trn1 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 trn1 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 trn1 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 trn1 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 trn1 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 trn2 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 trn2 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 trn2 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 trn2 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 trn2 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 trn2 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 trn2 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 uaba v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 uabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 uabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 uabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 uabal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 uabal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 uabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 uabd v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 uabdl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 uabdl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 uabdl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 uabdl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 uabdl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 uabdl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 uadalp v0.1d, v0.2s +# CHECK-NEXT: 1 2 0.50 uadalp v0.2d, v0.4s +# CHECK-NEXT: 1 2 0.50 uadalp v0.2s, v0.4h +# CHECK-NEXT: 1 2 0.50 uadalp v0.4h, v0.8b +# CHECK-NEXT: 1 2 0.50 uadalp v0.4s, 
v0.8h +# CHECK-NEXT: 1 2 0.50 uadalp v0.8h, v0.16b +# CHECK-NEXT: 1 2 0.50 uaddl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 uaddl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 uaddl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 uaddl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 uaddl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 uaddl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 uaddlp v0.1d, v0.2s +# CHECK-NEXT: 1 2 0.50 uaddlp v0.2d, v0.4s +# CHECK-NEXT: 1 2 0.50 uaddlp v0.2s, v0.4h +# CHECK-NEXT: 1 2 0.50 uaddlp v0.4h, v0.8b +# CHECK-NEXT: 1 2 0.50 uaddlp v0.4s, v0.8h +# CHECK-NEXT: 1 2 0.50 uaddlp v0.8h, v0.16b +# CHECK-NEXT: 1 2 0.50 uaddw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 2 0.50 uaddw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 2 0.50 uaddw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 2 0.50 uaddw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 2 0.50 uaddw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 2 0.50 uaddw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 ucvtf d21, d14 +# CHECK-NEXT: 1 3 0.50 ucvtf d21, d14, #64 +# CHECK-NEXT: 1 3 0.50 ucvtf s22, s13 +# CHECK-NEXT: 1 3 0.50 ucvtf s22, s13, #32 +# CHECK-NEXT: 1 3 0.50 ucvtf v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 ucvtf v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 ucvtf v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 ucvtf v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 ucvtf v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 ucvtf v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 ucvtf v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 ucvtf v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 uhadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 uhadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 uhsub v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 umax v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 umax v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 umax v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 umaxp v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 umaxp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 umaxp v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 umin v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 umin v0.4h, v0.4h, 
v0.4h +# CHECK-NEXT: 1 2 0.50 umin v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 uminp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 uminp v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 uminp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 umlal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 umlal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 umlal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 umlal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 umlal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 umlal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 umlsl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 umlsl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 umlsl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 umlsl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 umlsl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 umlsl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 umull v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 umull v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 umull v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 umull2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 umull2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 umull2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 uqadd h0, h1, h5 +# CHECK-NEXT: 1 2 0.50 uqadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 uqrshl b11, b20, b30 +# CHECK-NEXT: 1 2 0.50 uqrshl s23, s20, s16 +# CHECK-NEXT: 1 2 0.50 uqrshl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 uqrshl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 uqrshl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 uqrshl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 uqrshrn b10, h12, #5 +# CHECK-NEXT: 1 3 0.50 uqrshrn h12, s10, #14 +# CHECK-NEXT: 1 3 0.50 uqrshrn s10, d10, #25 +# CHECK-NEXT: 1 2 0.50 uqrshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 uqrshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 uqrshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 uqrshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 uqrshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 uqrshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 uqshl b11, b20, b30 +# 
CHECK-NEXT: 1 2 0.50 uqshl b18, b15, #6 +# CHECK-NEXT: 1 2 0.50 uqshl d15, d12, #19 +# CHECK-NEXT: 1 2 0.50 uqshl h11, h18, #7 +# CHECK-NEXT: 1 2 0.50 uqshl s14, s19, #18 +# CHECK-NEXT: 1 2 0.50 uqshl s23, s20, s16 +# CHECK-NEXT: 1 2 0.50 uqshl v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 2 0.50 uqshl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 uqshl v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 uqshl v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 uqshl v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 2 0.50 uqshl v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 2 0.50 uqshl v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 uqshl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 uqshl v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 2 0.50 uqshl v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 uqshl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 uqshrn b12, h10, #7 +# CHECK-NEXT: 1 3 0.50 uqshrn h10, s14, #5 +# CHECK-NEXT: 1 3 0.50 uqshrn s10, d12, #13 +# CHECK-NEXT: 1 2 0.50 uqshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 uqshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 uqshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 uqshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 uqshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 uqshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 uqsub d16, d16, d16 +# CHECK-NEXT: 1 2 0.50 uqsub v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 2 6 1.00 uqxtn b18, h18 +# CHECK-NEXT: 2 6 1.00 uqxtn h20, s17 +# CHECK-NEXT: 2 6 1.00 uqxtn s19, d14 +# CHECK-NEXT: 2 6 1.00 uqxtn v0.2s, v0.2d +# CHECK-NEXT: 2 6 1.00 uqxtn v0.4h, v0.4s +# CHECK-NEXT: 2 6 1.00 uqxtn v0.8b, v0.8h +# CHECK-NEXT: 2 6 1.00 uqxtn2 v0.16b, v0.8h +# CHECK-NEXT: 2 6 1.00 uqxtn2 v0.4s, v0.2d +# CHECK-NEXT: 2 6 1.00 uqxtn2 v0.8h, v0.4s +# CHECK-NEXT: 1 2 0.50 urecpe v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 urecpe v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 urhadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 urhadd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 urhadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 urshl d8, d7, d4 +# CHECK-NEXT: 1 3 0.50 urshl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 
1 3 0.50 urshl v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 urshl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 urshl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 urshr d20, d23, #31 +# CHECK-NEXT: 1 3 0.50 urshr v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 urshr v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 urshr v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 urshr v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 urshr v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 urshr v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 urshr v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 ursqrte v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 ursqrte v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 ursra d18, d10, #13 +# CHECK-NEXT: 1 2 0.50 ursra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 2 0.50 ursra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 ursra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 2 0.50 ursra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 2 0.50 ursra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 ursra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 2 0.50 ursra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 ushl d0, d0, d0 +# CHECK-NEXT: 1 2 0.50 ushl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 ushl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 ushl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 ushll v0.4s, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 ushll2 v0.8h, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 ushr d10, d17, #18 +# CHECK-NEXT: 1 3 0.50 ushr v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 ushr v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 ushr v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 ushr v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 ushr v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 ushr v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 ushr v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 usqadd b19, b14 +# CHECK-NEXT: 1 2 0.50 usqadd d18, d22 +# CHECK-NEXT: 1 2 0.50 usqadd h20, h15 +# CHECK-NEXT: 1 2 0.50 usqadd s21, s12 +# CHECK-NEXT: 1 2 0.50 usqadd v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 usqadd v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 usqadd v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 usqadd v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 usqadd 
v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 usqadd v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 usqadd v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 usra d20, d13, #61 +# CHECK-NEXT: 1 2 0.50 usra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 2 0.50 usra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 usra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 2 0.50 usra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 2 0.50 usra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 usra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 2 0.50 usra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 usubl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 usubl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 usubl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 usubl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 usubl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 usubl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 usubw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 2 0.50 usubw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 2 0.50 usubw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 2 0.50 usubw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 2 0.50 usubw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 2 0.50 usubw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 2 0.50 uzp1 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 uzp1 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 uzp1 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 uzp1 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 uzp1 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 uzp1 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 uzp1 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 uzp2 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 uzp2 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 uzp2 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 uzp2 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 uzp2 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 uzp2 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 uzp2 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 xtn v0.2s, v0.2d +# CHECK-NEXT: 1 2 0.50 xtn v0.4h, v0.4s +# CHECK-NEXT: 1 2 0.50 xtn v0.8b, v0.8h +# CHECK-NEXT: 1 2 0.50 xtn2 v0.16b, v0.8h +# CHECK-NEXT: 1 2 0.50 xtn2 v0.4s, v0.2d +# CHECK-NEXT: 1 2 
0.50 xtn2 v0.8h, v0.4s +# CHECK-NEXT: 1 2 0.50 zip1 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 zip1 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 zip1 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 zip1 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 zip1 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 zip1 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 zip1 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 zip2 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 zip2 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 zip2 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 zip2 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 zip2 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 zip2 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 zip2 v0.8h, v0.8h, v0.8h + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - Ampere1BUnitA +# CHECK-NEXT: [0.1] - Ampere1BUnitA +# CHECK-NEXT: [1.0] - Ampere1BUnitB +# CHECK-NEXT: [1.1] - Ampere1BUnitB +# CHECK-NEXT: [2] - Ampere1BUnitBS +# CHECK-NEXT: [3.0] - Ampere1BUnitL +# CHECK-NEXT: [3.1] - Ampere1BUnitL +# CHECK-NEXT: [4.0] - Ampere1BUnitS +# CHECK-NEXT: [4.1] - Ampere1BUnitS +# CHECK-NEXT: [5] - Ampere1BUnitX +# CHECK-NEXT: [6] - Ampere1BUnitY +# CHECK-NEXT: [7] - Ampere1BUnitZ + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4.0] [4.1] [5] [6] [7] +# CHECK-NEXT: - - - - 11.00 51.00 51.00 29.00 29.00 604.50 584.50 58.00 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4.0] [4.1] [5] [6] [7] Instructions: +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - abs d29, d24 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - abs v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - abs v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - abs v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - abs v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - abs v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - abs v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - abs v0.8h, 
v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - add d17, d31, d29 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - add v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - addhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - addhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - addhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - addhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - addhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - addhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - addp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - addp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - and v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - bic v0.4h, #15, lsl #8 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - bic v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - bif v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - bit v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - bsl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cls v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cls v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cls v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cls v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cls v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cls v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - clz v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - clz v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - clz v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - clz v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - clz v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - clz v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmeq d20, d21, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmeq d20, d21, 
d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmeq v0.16b, v0.16b, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmeq v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmge d20, d21, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmge d20, d21, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmge v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmge v0.8b, v0.8b, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmgt d20, d21, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmgt d20, d21, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmgt v0.2s, v0.2s, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmgt v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmhi d20, d21, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmhi v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmhs d20, d21, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmhs v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmle d20, d21, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmle v0.2d, v0.2d, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmlt d20, d21, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmlt v0.8h, v0.8h, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmtst d20, d21, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmtst v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cnt v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cnt v0.8b, v0.8b +# CHECK-NEXT: - - - - 1.00 - - - - - - - dup v0.16b, w28 +# CHECK-NEXT: - - - - 1.00 - - - - - - - dup v0.2d, x28 +# CHECK-NEXT: - - - - 1.00 - - - - - - - dup v0.2s, w28 +# CHECK-NEXT: - - - - 1.00 - - - - - - - dup v0.4h, w28 +# CHECK-NEXT: - - - - 1.00 - - - - - - - dup v0.4s, w28 +# CHECK-NEXT: - - - - 1.00 - - - - - - - dup v0.8b, w28 +# CHECK-NEXT: - - - - 1.00 - - - - - - - dup v0.8h, w28 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - eor v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ext v0.16b, 
v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ext v0.8b, v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fabd d29, d24, d20 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fabd s29, s24, s20 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fabd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fabs v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fabs v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fabs v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fabs v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fabs v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - facge d20, d21, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - facge s10, s11, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - facge v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - facgt d20, d21, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - facgt s10, s11, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - facgt v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fadd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - faddp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - faddp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmeq d20, d21, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmeq d20, d21, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmeq s10, s11, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmeq s10, s11, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmeq v0.2s, v0.2s, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmeq v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmge d20, d21, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmge d20, d21, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmge s10, s11, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmge s10, s11, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmge v0.2d, v0.2d, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 
fcmge v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmgt d20, d21, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmgt d20, d21, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmgt s10, s11, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmgt s10, s11, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmgt v0.4s, v0.4s, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmgt v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmle d20, d21, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmle s10, s11, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmle v0.2d, v0.2d, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmlt d20, d21, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmlt s10, s11, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmlt v0.4s, v0.4s, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtas d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtas s12, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtas v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtas v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtas v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtas v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtas v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtau d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtau s12, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtau v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtau v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtau v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtau v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtau v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtl v0.2d, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtl v0.4s, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtl2 v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtl2 v0.4s, v0.8h +# CHECK-NEXT: 
- - - - - - - - - 0.50 0.50 - fcvtms d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtms s22, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtms v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtms v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtms v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtms v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtms v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtmu d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtmu s12, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtmu v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtmu v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtmu v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtmu v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtmu v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtn v0.2s, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtn v0.4h, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtn2 v0.4s, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtn2 v0.8h, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtns d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtns s22, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtns v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtns v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtns v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtns v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtns v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtnu d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtnu s12, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtnu v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtnu v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtnu v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtnu v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 
0.50 0.50 - fcvtnu v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtps d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtps s22, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtps v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtps v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtps v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtps v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtps v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtpu d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtpu s12, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtpu v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtpu v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtpu v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtpu v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtpu v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtxn s22, d13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtxn v0.2s, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtxn2 v0.4s, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs d21, d12, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs s12, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs s21, s12, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu d21, d12, #1 +# CHECK-NEXT: - - - - - - - - - 
0.50 0.50 - fcvtzu d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu s12, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu s21, s12, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 - - fdiv v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmax v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmax v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmax v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxnm v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxnm v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxnm v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxnmp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxnmp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxnmp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmin v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmin v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmin v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fminnm v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fminnm v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 
0.50 0.50 - fminnm v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fminnmp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fminnmp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fminnmp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fminp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fminp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fminp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmla d0, d1, v0.d[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmla s0, s1, v0.s[3] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmla v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmls d0, d4, v0.d[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmls s3, s5, v0.s[3] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmls v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov v0.2d, #-1.25000000 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov v0.2s, #13.00000000 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov v0.4s, #1.00000000 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmul d0, d1, v0.d[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmul s0, s1, v0.s[3] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmul v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmulx d0, d4, v0.d[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmulx d23, d11, d1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmulx s20, s22, s15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmulx s3, s5, v0.s[3] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmulx v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmulx v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmulx v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fneg v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fneg v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fneg v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 
fneg v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fneg v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 - - frecpe d13, d13 +# CHECK-NEXT: - - - - - - - - - 1.00 - - frecpe s19, s14 +# CHECK-NEXT: - - - - - - - - - 1.00 - - frecpe v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 - - frecpe v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 1.00 - - frecpe v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 1.00 - - frecpe v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 1.00 - - frecpe v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frecps v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frecps d22, d30, d21 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frecps s21, s16, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frecpx d16, d19 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frecpx s18, s10 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinta v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinta v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinta v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinta v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinta v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinti v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinti v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinti v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinti v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinti v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintm v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintm v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintm v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintm v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintm v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintn v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintn v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintn v0.4h, 
v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintn v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintn v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintp v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintp v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintp v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintp v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintp v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintx v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintx v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintx v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintx v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintx v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintz v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintz v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintz v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintz v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintz v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 - - frsqrte d21, d12 +# CHECK-NEXT: - - - - - - - - - 1.00 - - frsqrte s22, s13 +# CHECK-NEXT: - - - - - - - - - 1.00 - - frsqrte v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 - - frsqrte v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 1.00 - - frsqrte v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 1.00 - - frsqrte v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 1.00 - - frsqrte v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frsqrts d8, d22, d18 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frsqrts s21, s5, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frsqrts v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 - - fsqrt v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 - - fsqrt v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 1.00 - - fsqrt v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 1.00 - - fsqrt v0.4s, v0.4s +# 
CHECK-NEXT: - - - - - - - - - 1.00 - - fsqrt v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fsub v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ld1 { v0.16b }, [x0] +# CHECK-NEXT: - - - - - 1.50 1.50 - - - - - ld1 { v0.2d, v1.2d, v2.2d }, [x0], #48 +# CHECK-NEXT: - - - - - 2.00 2.00 - - - - - ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +# CHECK-NEXT: - - - - - 1.00 1.00 - - - - - ld1 { v0.4s, v1.4s }, [sp], #32 +# CHECK-NEXT: - - - - - 1.50 1.50 - - - - - ld1 { v0.4s, v1.4s, v2.4s }, [sp] +# CHECK-NEXT: - - - - - 2.00 2.00 - - - - - ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ld1 { v0.8h }, [x15], x2 +# CHECK-NEXT: - - - - - 1.00 1.00 - - - - - ld1 { v0.8h, v1.8h }, [x15] +# CHECK-NEXT: - - - - - 0.50 0.50 - - 0.50 0.50 - ld1 { v0.b }[9], [x0] +# CHECK-NEXT: - - - - - 0.50 0.50 - - 0.50 0.50 - ld1 { v0.b }[9], [x0], #1 +# CHECK-NEXT: - - - - - 0.50 0.50 - - 0.50 0.50 - ld1r { v0.16b }, [x0] +# CHECK-NEXT: - - - - - 0.50 0.50 - - 0.50 0.50 - ld1r { v0.16b }, [x0], #1 +# CHECK-NEXT: - - - - - 0.50 0.50 - - 0.50 0.50 - ld1r { v0.8h }, [x15] +# CHECK-NEXT: - - - - - 0.50 0.50 - - 0.50 0.50 - ld1r { v0.8h }, [x15], #2 +# CHECK-NEXT: - - - - - 1.00 1.00 - - 1.00 1.00 - ld2 { v0.16b, v1.16b }, [x0], x1 +# CHECK-NEXT: - - - - - 1.00 1.00 - - 1.50 1.50 - ld2 { v0.8b, v1.8b }, [x0] +# CHECK-NEXT: - - - - - 1.00 1.00 - - 1.00 1.00 - ld2 { v0.h, v1.h }[7], [x15] +# CHECK-NEXT: - - - - - 1.00 1.00 - - 1.00 1.00 - ld2 { v0.h, v1.h }[7], [x15], #4 +# CHECK-NEXT: - - - - - 1.00 1.00 - - 1.00 1.00 - ld2r { v0.2d, v1.2d }, [x0] +# CHECK-NEXT: - - - - - 1.00 1.00 - - 1.00 1.00 - ld2r { v0.2d, v1.2d }, [x0], #16 +# CHECK-NEXT: - - - - - 1.00 1.00 - - 1.00 1.00 - ld2r { v0.4s, v1.4s }, [sp] +# CHECK-NEXT: - - - - - 1.00 1.00 - - 1.00 1.00 - ld2r { v0.4s, v1.4s }, [sp], #8 +# CHECK-NEXT: - - - - - 1.50 1.50 - - 1.50 1.50 - ld3 { v0.4h, v1.4h, v2.4h }, [x15] +# CHECK-NEXT: - - - - - 1.50 1.50 - - 1.50 1.50 - ld3 { 
v0.8h, v1.8h, v2.8h }, [x15], x2 +# CHECK-NEXT: - - - - - 1.50 1.50 - - 1.50 1.50 - ld3 { v0.s, v1.s, v2.s }[3], [sp] +# CHECK-NEXT: - - - - - 1.50 1.50 - - 1.50 1.50 - ld3 { v0.s, v1.s, v2.s }[3], [sp], x3 +# CHECK-NEXT: - - - - - 1.50 1.50 - - 1.50 1.50 - ld3r { v0.4h, v1.4h, v2.4h }, [x15] +# CHECK-NEXT: - - - - - 1.50 1.50 - - 1.50 1.50 - ld3r { v0.4h, v1.4h, v2.4h }, [x15], #6 +# CHECK-NEXT: - - - - - 1.50 1.50 - - 1.50 1.50 - ld3r { v0.8b, v1.8b, v2.8b }, [x0] +# CHECK-NEXT: - - - - - 1.50 1.50 - - 1.50 1.50 - ld3r { v0.8b, v1.8b, v2.8b }, [x0], #3 +# CHECK-NEXT: - - - - - 2.00 2.00 - - 2.00 2.00 - ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# CHECK-NEXT: - - - - - 2.00 2.00 - - 2.00 2.00 - ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64 +# CHECK-NEXT: - - - - - 2.00 2.00 - - 2.00 2.00 - ld4 { v0.d, v1.d, v2.d, v3.d }[1], [x0] +# CHECK-NEXT: - - - - - 2.00 2.00 - - 2.00 2.00 - ld4 { v0.d, v1.d, v2.d, v3.d }[1], [x0], #32 +# CHECK-NEXT: - - - - - 2.00 2.00 - - 2.00 2.00 - ld4 { v0.h, v1.h, v2.h, v3.h }[7], [x0], x0 +# CHECK-NEXT: - - - - - 2.00 2.00 - - - - - ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [sp] +# CHECK-NEXT: - - - - - 2.00 2.00 - - - - - ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [sp], x7 +# CHECK-NEXT: - - - - - 2.00 2.00 - - 2.00 2.00 - ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# CHECK-NEXT: - - - - - 2.00 2.00 - - 2.00 2.00 - ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x30 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mla v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mls v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mov b0, v0.b[15] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mov d6, v0.d[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mov h2, v0.h[5] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mov s17, v0.s[2] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mov v2.b[0], v0.b[0] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mov v2.h[1], v0.h[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mov v2.s[2], v0.s[2] +# CHECK-NEXT: - 
- - - - - - - - 0.50 0.50 - mov v2.d[1], v0.d[1] +# CHECK-NEXT: - - - - 1.00 - - - - 0.50 0.50 - mov v0.b[0], w8 +# CHECK-NEXT: - - - - 1.00 - - - - 0.50 0.50 - mov v0.h[1], w8 +# CHECK-NEXT: - - - - 1.00 - - - - 0.50 0.50 - mov v0.s[2], w8 +# CHECK-NEXT: - - - - 1.00 - - - - 0.50 0.50 - mov v0.d[1], x8 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mov v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mov v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - movi d15, #0xff00ff00ff00ff +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - movi v0.16b, #31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - movi v0.2d, #0xff0000ff0000ffff +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - movi v0.2s, #8, msl #8 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - movi v0.4s, #255, lsl #24 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - movi v0.8b, #255 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mul v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mvni v0.2s, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mvni v0.4s, #16, msl #16 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - neg d29, d24 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - neg v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - neg v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - neg v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - neg v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - neg v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - neg v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - neg v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mvn v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mvn v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - orn v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mov v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - orr v0.8h, #31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - pmul v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - pmul v0.8b, v0.8b, 
v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - pmull v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - pmull2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - raddhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - raddhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - raddhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - raddhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - raddhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - raddhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rbit v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rbit v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev16 v21.8b, v1.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev16 v30.16b, v31.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev32 v0.4h, v9.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev32 v21.8b, v1.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev32 v30.16b, v31.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev32 v4.8h, v7.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev64 v0.16b, v31.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev64 v1.8b, v9.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev64 v13.4h, v21.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev64 v2.8h, v4.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev64 v4.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev64 v6.4s, v8.4s +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rsubhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: - 
- - - - - - - - 1.00 1.00 - rsubhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rsubhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rsubhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rsubhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rsubhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saba v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabd v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabdl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabdl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabdl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabdl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabdl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabdl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sadalp v0.1d, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sadalp v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sadalp v0.2s, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sadalp v0.4h, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sadalp v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sadalp v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 
- saddl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddlp v0.1d, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddlp v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddlp v0.2s, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddlp v0.4h, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddlp v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddlp v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf d21, d12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf d21, d12, #64 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf s22, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf s22, s13, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shadd v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shl d7, d10, #12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shl v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - 
- - 0.50 0.50 - shl v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shl v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shl v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll v0.2d, v0.2s, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll v0.4s, v0.4h, #16 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll v0.8h, v0.8b, #8 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll v0.2d, v0.2s, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll v0.4s, v0.4h, #16 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll v0.8h, v0.8b, #8 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll2 v0.2d, v0.4s, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll2 v0.4s, v0.8h, #16 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll2 v0.8h, v0.16b, #8 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll2 v0.2d, v0.4s, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll2 v0.4s, v0.8h, #16 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll2 v0.8h, v0.16b, #8 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - shrn v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - shrn v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - shrn v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - shrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - shrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - shrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shsub v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shsub v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sli d10, d14, #12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sli v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sli v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sli v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sli v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sli v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sli v0.8b, v0.8b, #3 
+# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sli v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smax v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smax v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smax v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smaxp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smaxp v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smaxp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smin v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smin v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smin v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sminp v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sminp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sminp v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlsl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlsl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlsl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlsl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlsl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlsl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smull v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smull v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smull v0.8h, v0.8b, v0.8b +# 
CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smull2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smull2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smull2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs b19, b14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs d18, d12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs h21, h15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs s20, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqadd b20, b11, b15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqadd v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlal d19, s24, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlal d8, s9, v0.s[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlal s0, h0, v0.h[3] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlal s17, h27, h12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlsl d12, s23, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlsl d8, s9, v0.s[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlsl s0, h0, v0.h[3] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlsl s14, h12, h25 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 
sqdmlsl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlsl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlsl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlsl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmulh h10, h11, h12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmulh h7, h15, v0.h[3] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmulh s15, s14, v0.s[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmulh s20, s21, s2 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmulh v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmulh v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmull d1, s1, v0.s[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmull d15, s22, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmull s1, h1, v0.h[3] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmull s12, h22, h12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmull v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmull v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmull2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmull2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg b19, b14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg d18, d12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg h21, h15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg s20, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrdmulh h10, h11, h12 +# 
CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrdmulh h7, h15, v0.h[3] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrdmulh s15, s14, v0.s[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrdmulh s20, s21, s2 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrdmulh v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrdmulh v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshl d31, d31, d31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshl h3, h4, h15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrn b10, h13, #2 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrn h15, s10, #6 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrn s15, d12, #9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrun b17, h10, #6 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrun h10, s13, #15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrun s22, d16, #31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrun v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrun v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrun v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrun2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrun2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrun2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 
- sqshl b11, b19, #7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl d15, d16, #51 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl d31, d31, d31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl h13, h18, #11 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl h3, h4, h15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl s14, s17, #22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu b15, b18, #6 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu d11, d13, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu h19, h17, #6 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu s16, s14, #25 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshrn b10, h15, #5 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshrn h17, s10, #4 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshrn s18, d10, #31 +# CHECK-NEXT: - - - 
- - - - - - 1.00 1.00 - sqshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshrun b15, h10, #7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshrun h20, s14, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshrun s10, d15, #15 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrun v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrun v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrun v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrun2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrun2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrun2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqsub s20, s10, s7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqsub v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqsub v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqsub v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtn b18, h18 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtn h20, s17 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtn s19, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtn v0.2s, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtn v0.4h, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtn v0.8b, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtn2 v0.16b, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtn2 v0.4s, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtn2 v0.8h, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtun b19, h14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtun h21, s15 +# 
CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtun s20, d12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtun v0.2s, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtun v0.4h, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtun v0.8b, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtun2 v0.16b, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtun2 v0.4s, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtun2 v0.8h, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srhadd v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srhadd v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srhadd v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sri d10, d12, #14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sri v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sri v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sri v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sri v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sri v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sri v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sri v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshl d16, d16, d16 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshr d19, d18, #7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshr v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshr v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshr v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshr v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshr v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshr v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshr 
v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srsra d15, d11, #19 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srsra v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srsra v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srsra v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srsra v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srsra v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srsra v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srsra v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshl d31, d31, d31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshl v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshll v0.2d, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshll2 v0.4s, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshr d15, d16, #12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshr v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshr v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshr v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshr v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshr v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshr v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshr v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssra d18, d12, #21 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssra v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssra v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssra v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssra v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssra v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - 
- - - - - 0.50 0.50 - ssra v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssra v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 st1 { v0.16b }, [x0] +# CHECK-NEXT: - - - - - - - 1.50 1.50 - - 3.00 st1 { v0.2d, v1.2d, v2.2d }, [x0], #48 +# CHECK-NEXT: - - - - - - - 2.00 2.00 - - 4.00 st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 st1 { v0.4s, v1.4s }, [sp], #32 +# CHECK-NEXT: - - - - - - - 1.50 1.50 - - 3.00 st1 { v0.4s, v1.4s, v2.4s }, [sp] +# CHECK-NEXT: - - - - - - - 2.00 2.00 - - 4.00 st1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3 +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 st1 { v0.8h }, [x15], x2 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 st1 { v0.8h, v1.8h }, [x15] +# CHECK-NEXT: - - - - - - - 0.50 0.50 0.50 0.50 1.00 st1 { v0.d }[1], [x0] +# CHECK-NEXT: - - - - - - - 0.50 0.50 0.50 0.50 1.00 st1 { v0.d }[1], [x0], #8 +# CHECK-NEXT: - - - - - - - 1.00 1.00 1.00 1.00 2.00 st2 { v0.16b, v1.16b }, [x0], x1 +# CHECK-NEXT: - - - - - - - 1.00 1.00 1.00 1.00 2.00 st2 { v0.8b, v1.8b }, [x0] +# CHECK-NEXT: - - - - - - - 1.00 1.00 1.00 1.00 2.00 
st2 { v0.s, v1.s }[3], [sp] +# CHECK-NEXT: - - - - - - - 1.00 1.00 1.00 1.00 2.00 st2 { v0.s, v1.s }[3], [sp], #8 +# CHECK-NEXT: - - - - - - - 1.50 1.50 1.50 1.50 3.00 st3 { v0.4h, v1.4h, v2.4h }, [x15] +# CHECK-NEXT: - - - - - - - 1.50 1.50 1.50 1.50 3.00 st3 { v0.8h, v1.8h, v2.8h }, [x15], x2 +# CHECK-NEXT: - - - - - - - 1.50 1.50 1.50 1.50 3.00 st3 { v0.h, v1.h, v2.h }[7], [x15] +# CHECK-NEXT: - - - - - - - 1.50 1.50 1.50 1.50 3.00 st3 { v0.h, v1.h, v2.h }[7], [x15], #6 +# CHECK-NEXT: - - - - - - - 2.00 2.00 3.00 3.00 4.00 st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# CHECK-NEXT: - - - - - - - 2.00 2.00 2.00 2.00 4.00 st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64 +# CHECK-NEXT: - - - - - - - 2.00 2.00 2.00 2.00 4.00 st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0] +# CHECK-NEXT: - - - - - - - 2.00 2.00 2.00 2.00 4.00 st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0], x5 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sub d15, d5, d16 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sub v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd b19, b14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd d18, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd h20, h15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd s21, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - tbl v0.16b, { v0.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - tbl v0.16b, { v0.16b, v1.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - - 1.50 1.50 - tbl v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - - 2.00 2.00 - tbl 
v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - tbl v0.8b, { v0.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - tbl v0.8b, { v0.16b, v1.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - - 1.50 1.50 - tbl v0.8b, { v0.16b, v1.16b, v2.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - - 2.00 2.00 - tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - tbx v0.16b, { v0.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - tbx v0.16b, { v0.16b, v1.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - - 1.50 1.50 - tbx v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - - 2.00 2.00 - tbx v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - tbx v0.8b, { v0.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - tbx v0.8b, { v0.16b, v1.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - - 1.50 1.50 - tbx v0.8b, { v0.16b, v1.16b, v2.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - - 2.00 2.00 - tbx v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn1 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn1 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn1 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn1 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn1 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn1 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn1 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn2 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn2 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn2 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn2 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn2 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn2 v0.8b, 
v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn2 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaba v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabd v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabdl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabdl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabdl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabdl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabdl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabdl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uadalp v0.1d, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uadalp v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uadalp v0.2s, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uadalp v0.4h, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uadalp v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uadalp v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddlp v0.1d, v0.2s +# CHECK-NEXT: - - - - - - 
- - - 0.50 0.50 - uaddlp v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddlp v0.2s, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddlp v0.4h, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddlp v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddlp v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf d21, d14, #64 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf s22, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf s22, s13, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uhadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uhadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uhsub v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umax v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umax v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umax v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umaxp v0.16b, v0.16b, 
v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umaxp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umaxp v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umin v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umin v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umin v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uminp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uminp v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uminp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlsl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlsl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlsl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlsl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlsl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlsl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umull v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umull v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umull v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umull2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umull2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umull2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqadd h0, h1, h5 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqadd v0.8h, v0.8h, v0.8h 
+# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshl b11, b20, b30 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshl s23, s20, s16 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshrn b10, h12, #5 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshrn h12, s10, #14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshrn s10, d10, #25 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl b11, b20, b30 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl b18, b15, #6 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl d15, d12, #19 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl h11, h18, #7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl s14, s19, #18 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl s23, s20, s16 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.4s, v0.4s, v0.4s +# 
CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshrn b12, h10, #7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshrn h10, s14, #5 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshrn s10, d12, #13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqsub d16, d16, d16 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqsub v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - uqxtn b18, h18 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - uqxtn h20, s17 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - uqxtn s19, d14 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - uqxtn v0.2s, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - uqxtn v0.4h, v0.4s +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - uqxtn v0.8b, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - uqxtn2 v0.16b, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - uqxtn2 v0.4s, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - uqxtn2 v0.8h, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urecpe v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urecpe v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urhadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urhadd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urhadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshl d8, d7, d4 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshl v0.16b, v0.16b, 
v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshl v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshr d20, d23, #31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshr v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshr v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshr v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshr v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshr v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshr v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshr v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ursqrte v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ursqrte v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ursra d18, d10, #13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ursra v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ursra v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ursra v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ursra v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ursra v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ursra v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ursra v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushl d0, d0, d0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushll v0.4s, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushll2 v0.8h, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushr d10, d17, #18 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushr v0.16b, v0.16b, #3 +# CHECK-NEXT: - 
- - - - - - - - 0.50 0.50 - ushr v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushr v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushr v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushr v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushr v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushr v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd b19, b14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd d18, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd h20, h15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd s21, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usra d20, d13, #61 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usra v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usra v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usra v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usra v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usra v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usra v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usra v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - 
- - - - - - 0.50 0.50 - usubl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp1 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp1 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp1 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp1 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp1 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp1 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp1 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp2 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp2 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp2 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp2 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp2 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp2 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp2 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - xtn v0.2s, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - xtn v0.4h, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - xtn v0.8b, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - xtn2 v0.16b, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - xtn2 v0.4s, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - xtn2 v0.8h, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip1 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip1 v0.2d, v0.2d, v0.2d +# 
CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip1 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip1 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip1 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip1 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip1 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip2 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip2 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip2 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip2 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip2 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip2 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip2 v0.8h, v0.8h, v0.8h diff --git a/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/shifted-register.s b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/shifted-register.s new file mode 100644 index 00000000000000..27e0279a701013 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/shifted-register.s @@ -0,0 +1,31 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=ampere1b -resource-pressure=false < %s | FileCheck %s + + add w0, w1, w2, lsl #0 + sub x3, x4, x5, lsl #1 + adds x6, x7, x8, lsr #2 + subs x9, x10, x11, asr #3 + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 156 +# CHECK-NEXT: Total uOps: 600 + +# CHECK: Dispatch Width: 12 +# CHECK-NEXT: uOps Per Cycle: 3.85 +# CHECK-NEXT: IPC: 2.56 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.25 add w0, w1, w2 +# 
CHECK-NEXT: 1 1 0.25 sub x3, x4, x5, lsl #1 +# CHECK-NEXT: 2 2 0.50 adds x6, x7, x8, lsr #2 +# CHECK-NEXT: 2 2 0.50 subs x9, x10, x11, asr #3 diff --git a/llvm/test/tools/llvm-objcopy/ELF/binary-output-target.test b/llvm/test/tools/llvm-objcopy/ELF/binary-output-target.test index fc5856691f8dca..f88b7575002a94 100644 --- a/llvm/test/tools/llvm-objcopy/ELF/binary-output-target.test +++ b/llvm/test/tools/llvm-objcopy/ELF/binary-output-target.test @@ -48,6 +48,9 @@ # RUN: llvm-objcopy -I binary -O elf64-loongarch %t.txt %t.la64.o # RUN: llvm-readobj --file-headers %t.la64.o | FileCheck %s --check-prefixes=CHECK,LE,LA64,64 +# RUN: llvm-objcopy -I binary -O elf64-s390 %t.txt %t.s390x.o +# RUN: llvm-readobj --file-headers %t.s390x.o | FileCheck %s --check-prefixes=CHECK,BE,S390X,64 + # CHECK: Format: # 32-SAME: elf32- # 64-SAME: elf64- @@ -64,6 +67,7 @@ # PPCLE-SAME: powerpcle{{$}} # SPARC-SAME: sparc # SPARCEL-SAME: sparc +# S390X-SAME: s390 # X86-64-SAME: x86-64 # AARCH64-NEXT: Arch: aarch64 @@ -81,6 +85,7 @@ # RISCV64-NEXT: Arch: riscv64 # SPARC-NEXT: Arch: sparc{{$}} # SPARCEL-NEXT: Arch: sparcel +# S390X-NEXT: Arch: s390x # X86-64-NEXT: Arch: x86_64 # 32-NEXT: AddressSize: 32bit @@ -116,6 +121,7 @@ # RISCV64-NEXT: Machine: EM_RISCV (0xF3) # SPARC-NEXT: Machine: EM_SPARC (0x2) # SPARCEL-NEXT: Machine: EM_SPARC (0x2) +# S390X-NEXT: Machine: EM_S390 (0x16) # X86-64-NEXT: Machine: EM_X86_64 (0x3E) # CHECK-NEXT: Version: 1 diff --git a/llvm/test/tools/llvm-objcopy/ELF/cross-arch-headers.test b/llvm/test/tools/llvm-objcopy/ELF/cross-arch-headers.test index 882940c05e19c2..9a8128611792d5 100644 --- a/llvm/test/tools/llvm-objcopy/ELF/cross-arch-headers.test +++ b/llvm/test/tools/llvm-objcopy/ELF/cross-arch-headers.test @@ -117,6 +117,10 @@ # RUN: llvm-readobj --file-headers %t.elf64_loongarch.o | FileCheck %s --check-prefixes=CHECK,LE,LA64,64,SYSV # RUN: llvm-readobj --file-headers %t.elf64_loongarch.dwo | FileCheck %s --check-prefixes=CHECK,LE,LA64,64,SYSV +# RUN: 
llvm-objcopy %t.o -O elf64-s390 %t.elf64_s390.o --split-dwo=%t.elf64_s390.dwo +# RUN: llvm-readobj --file-headers %t.elf64_s390.o | FileCheck %s --check-prefixes=CHECK,BE,S390X,64,SYSV +# RUN: llvm-readobj --file-headers %t.elf64_s390.dwo | FileCheck %s --check-prefixes=CHECK,BE,S390X,64,SYSV + !ELF FileHeader: Class: ELFCLASS32 @@ -160,6 +164,7 @@ Symbols: # RISCV32-SAME: riscv{{$}} # RISCV64-SAME: riscv{{$}} # SPARC-SAME: sparc +# S390X-SAME: s390 # X86-64-SAME: x86-64 # DEFAULT-SAME: unknown @@ -182,6 +187,7 @@ Symbols: # RISCV64-NEXT: Arch: riscv64 # SPARC-NEXT: Arch: sparc{{$}} # SPARCEL-NEXT: Arch: sparcel +# S390X-NEXT: Arch: s390x # X86-64-NEXT: Arch: x86_64 # DEFAULT-NEXT: Arch: unknown @@ -210,6 +216,7 @@ Symbols: # RISCV32: Machine: EM_RISCV (0xF3) # RISCV64: Machine: EM_RISCV (0xF3) # SPARC: Machine: EM_SPARC (0x2) +# S390X: Machine: EM_S390 (0x16) # X86-64: Machine: EM_X86_64 (0x3E) # 32: HeaderSize: 52 diff --git a/llvm/test/tools/llvm-objdump/openbsd-headers.test b/llvm/test/tools/llvm-objdump/openbsd-headers.test index f547854feeeedf..84fa59bdf89f5c 100644 --- a/llvm/test/tools/llvm-objdump/openbsd-headers.test +++ b/llvm/test/tools/llvm-objdump/openbsd-headers.test @@ -11,6 +11,8 @@ # CHECK-NEXT: filesz 0x0000000000000000 memsz 0x0000000000000000 flags --- # CHECK-NEXT: OPENBSD_NOBTCFI off 0x0000000000000000 vaddr 0x0000000000000000 paddr 0x0000000000000000 align 2**0 # CHECK-NEXT: filesz 0x0000000000000000 memsz 0x0000000000000000 flags --- +# CHECK-NEXT: OPENBSD_SYSCALLS off 0x0000000000000000 vaddr 0x0000000000000000 paddr 0x0000000000000000 align 2**0 +# CHECK-NEXT: filesz 0x0000000000000000 memsz 0x0000000000000000 flags --- # CHECK-NEXT: OPENBSD_BOOTDATA off 0x0000000000000000 vaddr 0x0000000000000000 paddr 0x0000000000000000 align 2**0 # CHECK-NEXT: filesz 0x0000000000000000 memsz 0x0000000000000000 flags --- @@ -25,4 +27,5 @@ ProgramHeaders: - Type: 0x65a3dbe6 ## PT_OPENBSD_RANDOMIZE - Type: 0x65a3dbe7 ## PT_OPENBSD_WXNEEDED - Type: 
0x65a3dbe8 ## PT_OPENBSD_NOBTCFI + - Type: 0x65a3dbe9 ## PT_OPENBSD_SYSCALLS - Type: 0x65a41be6 ## PT_OPENBSD_BOOTDATA diff --git a/llvm/test/tools/llvm-readobj/COFF/file-headers.test b/llvm/test/tools/llvm-readobj/COFF/file-headers.test index b83a6cf5b972b3..32f39e196b0001 100644 --- a/llvm/test/tools/llvm-readobj/COFF/file-headers.test +++ b/llvm/test/tools/llvm-readobj/COFF/file-headers.test @@ -323,6 +323,7 @@ symbols: # IMPORTLIB:Format: COFF-import-file-i386 # IMPORTLIB-NEXT:Type: code # IMPORTLIB-NEXT:Name type: noprefix +# IMPORTLIB-NEXT:Export name: func # IMPORTLIB-NEXT:Symbol: __imp__func # IMPORTLIB-NEXT:Symbol: _func # IMPORTLIB-NOT:{{.}} diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test new file mode 100644 index 00000000000000..f4c73de7ca6c9d --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test @@ -0,0 +1,32 @@ +# UNSUPPORTED: zlib +# RUN: yaml2obj %s -o %t +# RUN: llvm-readobj -z -p .a -x .b %t 2>&1 | FileCheck %s -DFILE=%t + +# CHECK: String dump of section '.a': +# CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZLIB or did not find zlib at build time +# CHECK-NEXT: [ 0] . +# CHECK-NEXT: [ 8] . +# CHECK-NEXT: [ 10] . +# CHECK-NEXT: [ 18] x.c. +# CHECK-NEXT: [ 1e] . +# CHECK-NEXT: [ 20] . +# CHECK-NEXT: Hex dump of section '.b': +# CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZLIB or did not find zlib at build time +# CHECK-NEXT: 0x00000000 01000000 00000000 01000000 00000000 ................ +# CHECK-NEXT: 0x00000010 01000000 00000000 789c6304 00000200 ........x.c..... +# CHECK-NEXT: 0x00000020 02 . 
+ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL +Sections: + - Name: .a + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 010000000000000001000000000000000100000000000000789c63040000020002 + - Name: .b + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 010000000000000001000000000000000100000000000000789c63040000020002 diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test new file mode 100644 index 00000000000000..ea7a8854eb1a0c --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test @@ -0,0 +1,76 @@ +# REQUIRES: zlib +## Test --decompress/-z. + +# RUN: yaml2obj %s -o %t + +# RUN: llvm-readelf -z -x .strings -x .not_null_terminated %t | FileCheck %s --check-prefix=HEX +# RUN: llvm-readobj --decompress -p .strings -p .not_null_terminated %t | FileCheck %s --check-prefix=STR + +# HEX: Hex dump of section '.strings': +# HEX-NEXT: 0x00000000 68657265 00617265 00736f6d 65007374 here.are.some.st +# HEX-NEXT: 0x00000010 72696e67 7300 rings. +# HEX: Hex dump of section '.not_null_terminated': +# HEX-NEXT: 0x00000000 6e6f006e 756c6c no.null + +# STR: String dump of section '.strings': +# STR-NEXT: [ 0] here +# STR-NEXT: [ 5] are +# STR-NEXT: [ 9] some +# STR-NEXT: [ e] strings +# STR-EMPTY: +# STR-NEXT: String dump of section '.not_null_terminated': +# STR-NEXT: [ 0] no +# STR-NEXT: [ 3] null{{$}} +# STR-NOT: {{.}} + +# RUN: llvm-readobj -x .strings -p .not_null_terminated %t | FileCheck %s --check-prefix=COMPRESSED + +# COMPRESSED: String dump of section '.not_null_terminated': +# COMPRESSED-NEXT: [ 0] no +# COMPRESSED-NEXT: [ 3] null +# COMPRESSED-NEXT: Hex dump of section '.strings': +# COMPRESSED-NEXT: 0x00000000 01000000 00000000 16000000 00000000 ................ 
+# COMPRESSED-NEXT: 0x00000010 00000000 00000000 789ccb48 2d4a6548 ........x..H-JeH +# COMPRESSED-NEXT: 0x00000020 04e2e2fc 5c205152 9499975e cc000058 ....\ QR...^...X +# COMPRESSED-NEXT: 0x00000030 2e079b ... + +# RUN: llvm-readelf -z -p .invalid1 -x .invalid2 -x .invalid3 %t 2>&1 | FileCheck %s -DFILE=%t --check-prefix=INVALID + +# INVALID: String dump of section '.invalid1': +# INVALID-NEXT: warning: '[[FILE]]': corrupted compressed section header +# INVALID-NEXT: [ 0] . +# INVALID-NEXT: Hex dump of section '.invalid2': +# INVALID-NEXT: warning: '[[FILE]]': zlib error: Z_DATA_ERROR +# INVALID-NEXT: 0x00000000 01000000 00000000 16000000 00000000 ................ +# INVALID-NEXT: 0x00000010 00000000 00000000 78 ........x +# INVALID-EMPTY: +# INVALID-NEXT: Hex dump of section '.invalid3': +# INVALID-NEXT: warning: '[[FILE]]': unsupported compression type (3) +# INVALID-NEXT: 0x00000000 03000000 00000000 04000000 00000000 ................ +# INVALID-NEXT: 0x00000010 00000000 00000000 789c6360 ........x.c` + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL +Sections: + - Name: .strings + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 010000000000000016000000000000000000000000000000789ccb482d4a654804e2e2fc5c2051529499975ecc0000582e079b + - Name: .not_null_terminated + Type: SHT_PROGBITS + Content: 6e6f006e756c6c + - Name: .invalid1 + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 01 + - Name: .invalid2 + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 01000000000000001600000000000000000000000000000078 + - Name: .invalid3 + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 030000000000000004000000000000000000000000000000789c6360 diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test new file mode 100644 index 00000000000000..65da952687f526 --- /dev/null +++ 
b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test @@ -0,0 +1,31 @@ +# UNSUPPORTED: zstd +# RUN: yaml2obj %s -o %t +# RUN: llvm-readobj -z -p .a -x .b %t 2>&1 | FileCheck %s -DFILE=%t + +# CHECK: String dump of section '.a': +# CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZSTD or did not find zstd at build time +# CHECK-NEXT: [ 0] . +# CHECK-NEXT: [ 8] . +# CHECK-NEXT: [ 10] . +# CHECK-NEXT: [ 18] (./. .. +# CHECK-NEXT: [ 21] . +# CHECK-NEXT: Hex dump of section '.b': +# CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZSTD or did not find zstd at build time +# CHECK-NEXT: 0x00000000 02000000 00000000 01000000 00000000 ................ +# CHECK-NEXT: 0x00000010 01000000 00000000 28b52ffd 20010900 ........(./. ... +# CHECK-NEXT: 0x00000020 0001 .. + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL +Sections: + - Name: .a + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 02000000000000000100000000000000010000000000000028b52ffd200109000001 + - Name: .b + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 02000000000000000100000000000000010000000000000028b52ffd200109000001 diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zstd.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd.test new file mode 100644 index 00000000000000..519db879b18c17 --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd.test @@ -0,0 +1,28 @@ +# REQUIRES: zstd +## Test --decompress/-z for zstd. + +# RUN: yaml2obj %s -o %t + +# RUN: llvm-readelf -z -x .strings %t | FileCheck %s --check-prefix=HEX +# RUN: llvm-readobj --decompress -p .strings %t | FileCheck %s --check-prefix=STR + +# HEX: Hex dump of section '.strings': +# HEX-NEXT: 0x00000000 68657265 00617265 00736f6d 65007374 here.are.some.st +# HEX-NEXT: 0x00000010 72696e67 7300 rings. 
+ +# STR: String dump of section '.strings': +# STR-NEXT: [ 0] here +# STR-NEXT: [ 5] are +# STR-NEXT: [ 9] some +# STR-NEXT: [ e] strings + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL +Sections: + - Name: .strings + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 02000000000000001600000000000000000000000000000028b52ffd2016b10000686572650061726500736f6d6500737472696e677300 diff --git a/llvm/test/tools/llvm-readobj/ELF/program-headers.test b/llvm/test/tools/llvm-readobj/ELF/program-headers.test index 702a06b6403f0a..856cf378ddad95 100644 --- a/llvm/test/tools/llvm-readobj/ELF/program-headers.test +++ b/llvm/test/tools/llvm-readobj/ELF/program-headers.test @@ -29,68 +29,70 @@ # RUN: FileCheck %s --check-prefixes=ELF64,MAPPING --strict-whitespace --match-full-lines # RUN: llvm-readobj -l %t64.elf | FileCheck %s --check-prefixes=ELF-LLVM,ELF64-LLVM -# ELF32:There are 25 program headers, starting at offset 52 +# ELF32:There are 26 program headers, starting at offset 52 # ELF32-EMPTY: # ELF32-NEXT:Program Headers: # ELF32-NEXT: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align -# ELF32-NEXT: PHDR 0x000354 0x00001000 0x00001000 0x00003 0x00003 W 0x1 -# ELF32-NEXT: PHDR 0x000357 0x00002000 0x00002000 0x00007 0x00007 E 0x1 -# ELF32-NEXT: NULL 0x000357 0x00002000 0x00002000 0x00007 0x00007 E 0x1 -# ELF32-NEXT: DYNAMIC 0x000354 0x00001000 0x00001000 0x00003 0x00003 RWE 0x1 -# ELF32-NEXT: INTERP 0x00035e 0x00003000 0x00003000 0x00004 0x00004 RW 0x1 +# ELF32-NEXT: PHDR 0x000374 0x00001000 0x00001000 0x00003 0x00003 W 0x1 +# ELF32-NEXT: PHDR 0x000377 0x00002000 0x00002000 0x00007 0x00007 E 0x1 +# ELF32-NEXT: NULL 0x000377 0x00002000 0x00002000 0x00007 0x00007 E 0x1 +# ELF32-NEXT: DYNAMIC 0x000374 0x00001000 0x00001000 0x00003 0x00003 RWE 0x1 +# ELF32-NEXT: INTERP 0x00037e 0x00003000 0x00003000 0x00004 0x00004 RW 0x1 # ELF32-NEXT: [Requesting program interpreter: ABC] -# ELF32-NEXT: NOTE 0x000354 0x00001000 0x00001000 0x00003 0x00003 
0x1 -# ELF32-NEXT: SHLIB 0x000354 0x00001000 0x00001000 0x00001 0x00001 0x1 -# ELF32-NEXT: TLS 0x000362 0x00004000 0x00004000 0x00001 0x00001 0x1 -# ELF32-NEXT: : 0x60000000 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: GNU_EH_FRAME 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: SUNW_UNWIND 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: GNU_STACK 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: GNU_RELRO 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: GNU_PROPERTY 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: OPENBSD_MUTABLE 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: OPENBSD_RANDOMIZE 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: OPENBSD_WXNEEDED 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: OPENBSD_NOBTCFI 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: OPENBSD_BOOTDATA 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: : 0x6fffffff 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: : 0x70000000 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: : 0x70000001 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: : 0x70000002 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: : 0x70000003 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: : 0x7fffffff 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: NOTE 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: SHLIB 0x000374 0x00001000 0x00001000 0x00001 0x00001 0x1 +# ELF32-NEXT: TLS 0x000382 0x00004000 0x00004000 0x00001 0x00001 0x1 +# ELF32-NEXT: : 0x60000000 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: GNU_EH_FRAME 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: SUNW_UNWIND 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: GNU_STACK 0x000374 0x00001000 
0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: GNU_RELRO 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: GNU_PROPERTY 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: OPENBSD_MUTABLE 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: OPENBSD_RANDOMIZE 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: OPENBSD_WXNEEDED 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: OPENBSD_NOBTCFI 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: OPENBSD_SYSCALLS 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: OPENBSD_BOOTDATA 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: : 0x6fffffff 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: : 0x70000000 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: : 0x70000001 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: : 0x70000002 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: : 0x70000003 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: : 0x7fffffff 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 # ELF32-EMPTY: -# ELF64:There are 25 program headers, starting at offset 64 +# ELF64:There are 26 program headers, starting at offset 64 # ELF64-EMPTY: # ELF64-NEXT:Program Headers: # ELF64-NEXT: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align -# ELF64-NEXT: PHDR 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 W 0x1 -# ELF64-NEXT: PHDR 0x0005bb 0x0000000000002000 0x0000000000002000 0x000007 0x000007 E 0x1 -# ELF64-NEXT: NULL 0x0005bb 0x0000000000002000 0x0000000000002000 0x000007 0x000007 E 0x1 -# ELF64-NEXT: DYNAMIC 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 RWE 0x1 -# ELF64-NEXT: INTERP 0x0005c2 0x0000000000003000 0x0000000000003000 0x000004 0x000004 RW 0x1 +# ELF64-NEXT: PHDR 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 W 0x1 +# ELF64-NEXT: PHDR 0x0005f3 
0x0000000000002000 0x0000000000002000 0x000007 0x000007 E 0x1 +# ELF64-NEXT: NULL 0x0005f3 0x0000000000002000 0x0000000000002000 0x000007 0x000007 E 0x1 +# ELF64-NEXT: DYNAMIC 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 RWE 0x1 +# ELF64-NEXT: INTERP 0x0005fa 0x0000000000003000 0x0000000000003000 0x000004 0x000004 RW 0x1 # ELF64-NEXT: [Requesting program interpreter: ABC] -# ELF64-NEXT: NOTE 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: SHLIB 0x0005b8 0x0000000000001000 0x0000000000001000 0x000001 0x000001 0x1 -# ELF64-NEXT: TLS 0x0005c6 0x0000000000004000 0x0000000000004000 0x000001 0x000001 0x1 -# ELF64-NEXT: : 0x60000000 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: GNU_EH_FRAME 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: SUNW_UNWIND 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: GNU_STACK 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: GNU_RELRO 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: GNU_PROPERTY 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: OPENBSD_MUTABLE 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: OPENBSD_RANDOMIZE 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: OPENBSD_WXNEEDED 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: OPENBSD_NOBTCFI 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: OPENBSD_BOOTDATA 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: : 0x6fffffff 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: : 0x70000000 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: : 0x70000001 0x0005b8 0x0000000000001000 
0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: : 0x70000002 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: : 0x70000003 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: : 0x7fffffff 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: NOTE 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: SHLIB 0x0005f0 0x0000000000001000 0x0000000000001000 0x000001 0x000001 0x1 +# ELF64-NEXT: TLS 0x0005fe 0x0000000000004000 0x0000000000004000 0x000001 0x000001 0x1 +# ELF64-NEXT: : 0x60000000 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: GNU_EH_FRAME 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: SUNW_UNWIND 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: GNU_STACK 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: GNU_RELRO 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: GNU_PROPERTY 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: OPENBSD_MUTABLE 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: OPENBSD_RANDOMIZE 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: OPENBSD_WXNEEDED 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: OPENBSD_NOBTCFI 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: OPENBSD_SYSCALLS 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: OPENBSD_BOOTDATA 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: : 0x6fffffff 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: : 0x70000000 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: : 
0x70000001 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: : 0x70000002 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: : 0x70000003 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: : 0x7fffffff 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 # ELF64-EMPTY: # MAPPING: Section to Segment mapping: @@ -120,13 +122,14 @@ # MAPPING-NEXT: 22 .foo.begin .foo.end {{$}} # MAPPING-NEXT: 23 .foo.begin .foo.end {{$}} # MAPPING-NEXT: 24 .foo.begin .foo.end {{$}} +# MAPPING-NEXT: 25 .foo.begin .foo.end {{$}} # MAPPING-NEXT: None .unused .strtab .shstrtab {{$}} # ELF-LLVM: ProgramHeaders [ # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_PHDR (0x6) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -138,8 +141,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_PHDR (0x6) -# ELF32-LLVM-NEXT: Offset: 0x357 -# ELF64-LLVM-NEXT: Offset: 0x5BB +# ELF32-LLVM-NEXT: Offset: 0x377 +# ELF64-LLVM-NEXT: Offset: 0x5F3 # ELF-LLVM-NEXT: VirtualAddress: 0x2000 # ELF-LLVM-NEXT: PhysicalAddress: 0x2000 # ELF-LLVM-NEXT: FileSize: 7 @@ -151,8 +154,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_NULL (0x0) -# ELF32-LLVM-NEXT: Offset: 0x357 -# ELF64-LLVM-NEXT: Offset: 0x5BB +# ELF32-LLVM-NEXT: Offset: 0x377 +# ELF64-LLVM-NEXT: Offset: 0x5F3 # ELF-LLVM-NEXT: VirtualAddress: 0x2000 # ELF-LLVM-NEXT: PhysicalAddress: 0x2000 # ELF-LLVM-NEXT: FileSize: 7 @@ -164,8 +167,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_DYNAMIC (0x2) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # 
ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -179,8 +182,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_INTERP (0x3) -# ELF32-LLVM-NEXT: Offset: 0x35E -# ELF64-LLVM-NEXT: Offset: 0x5C2 +# ELF32-LLVM-NEXT: Offset: 0x37E +# ELF64-LLVM-NEXT: Offset: 0x5FA # ELF-LLVM-NEXT: VirtualAddress: 0x3000 # ELF-LLVM-NEXT: PhysicalAddress: 0x3000 # ELF-LLVM-NEXT: FileSize: 4 @@ -193,8 +196,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_NOTE (0x4) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -205,8 +208,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_SHLIB (0x5) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 1 @@ -217,8 +220,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_TLS (0x7) -# ELF32-LLVM-NEXT: Offset: 0x362 -# ELF64-LLVM-NEXT: Offset: 0x5C6 +# ELF32-LLVM-NEXT: Offset: 0x382 +# ELF64-LLVM-NEXT: Offset: 0x5FE # ELF-LLVM-NEXT: VirtualAddress: 0x4000 # ELF-LLVM-NEXT: PhysicalAddress: 0x4000 # ELF-LLVM-NEXT: FileSize: 1 @@ -229,8 +232,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: Unknown (0x60000000) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -241,8 +244,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: 
PT_GNU_EH_FRAME (0x6474E550) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -253,8 +256,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_SUNW_UNWIND (0x6464E550) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -265,8 +268,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_GNU_STACK (0x6474E551) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -277,8 +280,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_GNU_RELRO (0x6474E552) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -289,8 +292,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_GNU_PROPERTY (0x6474E553) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -301,8 +304,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_OPENBSD_MUTABLE (0x65A3DBE5) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: 
Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -313,8 +316,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_OPENBSD_RANDOMIZE (0x65A3DBE6) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -325,8 +328,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_OPENBSD_WXNEEDED (0x65A3DBE7) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -337,8 +340,20 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_OPENBSD_NOBTCFI (0x65A3DBE8) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 +# ELF-LLVM-NEXT: VirtualAddress: 0x1000 +# ELF-LLVM-NEXT: PhysicalAddress: 0x1000 +# ELF-LLVM-NEXT: FileSize: 3 +# ELF-LLVM-NEXT: MemSize: 3 +# ELF-LLVM-NEXT: Flags [ (0x0) +# ELF-LLVM-NEXT: ] +# ELF-LLVM-NEXT: Alignment: 1 +# ELF-LLVM-NEXT: } +# ELF-LLVM-NEXT: ProgramHeader { +# ELF-LLVM-NEXT: Type: PT_OPENBSD_SYSCALLS (0x65A3DBE9) +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -349,8 +364,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_OPENBSD_BOOTDATA (0x65A41BE6) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # 
ELF-LLVM-NEXT: FileSize: 3 @@ -361,8 +376,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: Unknown (0x6FFFFFFF) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -373,8 +388,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: Unknown (0x70000000) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -385,8 +400,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: Unknown (0x70000001) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -397,8 +412,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: Unknown (0x70000002) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -409,8 +424,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: Unknown (0x70000003) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -421,8 +436,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: Unknown (0x7FFFFFFF) -# ELF32-LLVM-NEXT: Offset: 0x354 -# 
ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -569,37 +584,42 @@ ProgramHeaders: VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 19: the PT_OPENBSD_BOOTDATA segment. +## Case 19: the PT_OPENBSD_SYSCALLS segment. + - Type: 0x65a3dbe9 ## PT_OPENBSD_SYSCALLS + VAddr: 0x1000 + FirstSec: .foo.begin + LastSec: .foo.end +## Case 20: the PT_OPENBSD_BOOTDATA segment. - Type: 0x65a41be6 ## PT_OPENBSD_BOOTDATA VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 20: the PT_HIOS segment. +## Case 21: the PT_HIOS segment. - Type: 0x6fffffff ## PT_HIOS VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 21: the PT_LOPROC/PT_ARM_ARCHEXT/PT_MIPS_REGINFO segment. +## Case 22: the PT_LOPROC/PT_ARM_ARCHEXT/PT_MIPS_REGINFO segment. - Type: 0x70000000 ## PT_LOPROC/PT_ARM_ARCHEXT/PT_MIPS_REGINFO VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 22: the PT_ARM_EXIDX/PT_MIPS_RTPROC segment. +## Case 23: the PT_ARM_EXIDX/PT_MIPS_RTPROC segment. - Type: 0x70000001 ## PT_ARM_EXIDX, PT_MIPS_RTPROC VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 23: the PT_MIPS_OPTIONS segment. +## Case 24: the PT_MIPS_OPTIONS segment. - Type: 0x70000002 ## PT_MIPS_OPTIONS VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 24: the PT_MIPS_ABIFLAGS/PT_RISCV_ATTRIBUTES segment. +## Case 25: the PT_MIPS_ABIFLAGS/PT_RISCV_ATTRIBUTES segment. - Type: 0x70000003 ## PT_MIPS_ABIFLAGS/PT_RISCV_ATTRIBUTES VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 25: the PT_HIPROC segment. +## Case 26: the PT_HIPROC segment. 
- Type: 0x7fffffff ## PT_HIPROC VAddr: 0x1000 FirstSec: .foo.begin @@ -610,9 +630,9 @@ ProgramHeaders: # RUN: llvm-readelf --program-headers %tarm.elf | FileCheck %s --check-prefix=ARM-GNU # RUN: llvm-readobj --program-headers %tarm.elf | FileCheck %s --check-prefix=ARM-LLVM -# ARM-GNU: : 0x70000000 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ARM-GNU-NEXT: EXIDX 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ARM-GNU-NEXT: : 0x70000002 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ARM-GNU: : 0x70000000 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ARM-GNU-NEXT: EXIDX 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ARM-GNU-NEXT: : 0x70000002 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 # ARM-LLVM: ProgramHeader { # ARM-LLVM: Type: Unknown (0x70000000) @@ -626,10 +646,10 @@ ProgramHeaders: # RUN: llvm-readelf --program-headers %tmips.elf | FileCheck %s --check-prefix=MIPS-GNU # RUN: llvm-readobj --program-headers %tmips.elf | FileCheck %s --check-prefix=MIPS-LLVM -# MIPS-GNU: REGINFO 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# MIPS-GNU-NEXT: RTPROC 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# MIPS-GNU-NEXT: OPTIONS 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# MIPS-GNU-NEXT: ABIFLAGS 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# MIPS-GNU: REGINFO 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# MIPS-GNU-NEXT: RTPROC 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# MIPS-GNU-NEXT: OPTIONS 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# MIPS-GNU-NEXT: ABIFLAGS 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 # MIPS-LLVM: ProgramHeader { # MIPS-LLVM: Type: PT_MIPS_REGINFO (0x70000000) @@ -645,7 +665,7 @@ ProgramHeaders: # RUN: 
llvm-readelf --program-headers %triscv.elf | FileCheck %s --check-prefix=RISCV-GNU # RUN: llvm-readobj --program-headers %triscv.elf | FileCheck %s --check-prefix=RISCV-LLVM -# RISCV-GNU: ATTRIBUTES 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# RISCV-GNU: ATTRIBUTES 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 # RISCV-LLVM: ProgramHeader { # RISCV-LLVM: Type: PT_RISCV_ATTRIBUTES (0x70000003) diff --git a/llvm/tools/llvm-cov/SourceCoverageView.cpp b/llvm/tools/llvm-cov/SourceCoverageView.cpp index 71edd5fec4280a..5b85d7d86bfb94 100644 --- a/llvm/tools/llvm-cov/SourceCoverageView.cpp +++ b/llvm/tools/llvm-cov/SourceCoverageView.cpp @@ -139,7 +139,7 @@ bool SourceCoverageView::shouldRenderRegionMarkers( bool SourceCoverageView::hasSubViews() const { return !ExpansionSubViews.empty() || !InstantiationSubViews.empty() || - !BranchSubViews.empty(); + !BranchSubViews.empty() || !MCDCSubViews.empty(); } std::unique_ptr diff --git a/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp b/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp index abc4c49ecae98e..b93d8cb035306b 100644 --- a/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp +++ b/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp @@ -246,6 +246,9 @@ tr:hover { tr:last-child { border-bottom: none; } +tr:has(> td >a:target) > td.code > pre { + background-color: #ffa; +} )"; const char *EndHeader = ""; @@ -990,15 +993,13 @@ void SourceCoverageViewHTML::renderMCDCView(raw_ostream &OS, MCDCView &MRV, std::string ColNoStr = Twine(DecisionRegion.ColumnStart).str(); std::string TargetName = "L" + LineNoStr; OS << tag("span", - a("#" + TargetName, tag("span", LineNoStr + ":" + ColNoStr), - TargetName), + a("#" + TargetName, tag("span", LineNoStr + ":" + ColNoStr)), "line-number") + ") to ("; LineNoStr = utostr(uint64_t(DecisionRegion.LineEnd)); ColNoStr = utostr(uint64_t(DecisionRegion.ColumnEnd)); OS << tag("span", - a("#" + TargetName, tag("span", LineNoStr + ":" + ColNoStr), - 
TargetName), + a("#" + TargetName, tag("span", LineNoStr + ":" + ColNoStr)), "line-number") + ")\n\n"; diff --git a/llvm/tools/llvm-cov/SourceCoverageViewText.cpp b/llvm/tools/llvm-cov/SourceCoverageViewText.cpp index 73b7ffe16a9637..580da45ecfc0d8 100644 --- a/llvm/tools/llvm-cov/SourceCoverageViewText.cpp +++ b/llvm/tools/llvm-cov/SourceCoverageViewText.cpp @@ -382,7 +382,8 @@ void SourceCoverageViewText::renderMCDCView(raw_ostream &OS, MCDCView &MRV, colored_ostream(OS, raw_ostream::RED, getOptions().Colors && Record.getPercentCovered() < 100.0, /*Bold=*/false, /*BG=*/true) - << format("%0.2f", Record.getPercentCovered()) << "%\n"; + << format("%0.2f", Record.getPercentCovered()) << "%"; + OS << "\n"; renderLinePrefix(OS, ViewDepth); OS << "\n"; } diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index b2a133860197dd..769ed17ac4cbd5 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -1294,10 +1294,11 @@ class MemoryMatcher { }; static StringRef detectStubKind(const Session::MemoryRegionInfo &Stub) { - constexpr uint32_t Armv7MovWTle = 0xe300c000; - constexpr uint32_t Armv7BxR12le = 0xe12fff1c; - constexpr uint32_t Thumbv7MovWTle = 0x0c00f240; - constexpr uint16_t Thumbv7BxR12le = 0x4760; + using namespace support::endian; + auto Armv7MovWTle = byte_swap(0xe300c000); + auto Armv7BxR12le = byte_swap(0xe12fff1c); + auto Thumbv7MovWTle = byte_swap(0x0c00f240); + auto Thumbv7BxR12le = byte_swap(0x4760); MemoryMatcher M(Stub.getContent()); if (M.matchMask(Thumbv7MovWTle)) { diff --git a/llvm/tools/llvm-mc/llvm-mc.cpp b/llvm/tools/llvm-mc/llvm-mc.cpp index cf6aaa1f06981f..2754d49645595d 100644 --- a/llvm/tools/llvm-mc/llvm-mc.cpp +++ b/llvm/tools/llvm-mc/llvm-mc.cpp @@ -547,11 +547,6 @@ int main(int argc, char **argv) { std::unique_ptr MAB( TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions)); auto FOut = std::make_unique(*OS); - // FIXME: Workaround for bug in 
formatted_raw_ostream. Color escape codes - // are (incorrectly) written directly to the unbuffered raw_ostream wrapped - // by the formatted_raw_ostream. - if (Action == AC_CDisassemble) - FOut->SetUnbuffered(); Str.reset( TheTarget->createAsmStreamer(Ctx, std::move(FOut), /*asmverbose*/ true, /*useDwarfDirectory*/ true, IP, diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp index f15307181fad61..f63e5c61e802c8 100644 --- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp +++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp @@ -299,6 +299,8 @@ static const StringMap TargetMap{ // LoongArch {"elf32-loongarch", {ELF::EM_LOONGARCH, false, true}}, {"elf64-loongarch", {ELF::EM_LOONGARCH, true, true}}, + // SystemZ + {"elf64-s390", {ELF::EM_S390, true, false}}, }; static Expected diff --git a/llvm/tools/llvm-objdump/ELFDump.cpp b/llvm/tools/llvm-objdump/ELFDump.cpp index 34861ee92128fd..fda99bd6d33e17 100644 --- a/llvm/tools/llvm-objdump/ELFDump.cpp +++ b/llvm/tools/llvm-objdump/ELFDump.cpp @@ -291,6 +291,9 @@ template void ELFDumper::printProgramHeaders() { case ELF::PT_OPENBSD_RANDOMIZE: outs() << "OPENBSD_RANDOMIZE "; break; + case ELF::PT_OPENBSD_SYSCALLS: + outs() << "OPENBSD_SYSCALLS "; + break; case ELF::PT_OPENBSD_WXNEEDED: outs() << "OPENBSD_WXNEEDED "; break; diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index 22b427f57658e1..7ecdd60313d065 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -2032,13 +2032,6 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, formatted_raw_ostream FOS(outs()); - // FIXME: Workaround for bug in formatted_raw_ostream. Color escape codes - // are (incorrectly) written directly to the unbuffered raw_ostream - // wrapped by the formatted_raw_ostream. 
- if (DisassemblyColor == ColorOutput::Enable || - DisassemblyColor == ColorOutput::Auto) - FOS.SetUnbuffered(); - std::unordered_map AllLabels; std::unordered_map> BBAddrMapLabels; if (SymbolizeOperands) { diff --git a/llvm/tools/llvm-readobj/COFFImportDumper.cpp b/llvm/tools/llvm-readobj/COFFImportDumper.cpp index 8aedc310ae3a9f..0ab2a17655653e 100644 --- a/llvm/tools/llvm-readobj/COFFImportDumper.cpp +++ b/llvm/tools/llvm-readobj/COFFImportDumper.cpp @@ -45,8 +45,14 @@ void dumpCOFFImportFile(const COFFImportFile *File, ScopedPrinter &Writer) { case COFF::IMPORT_NAME_UNDECORATE: Writer.printString("Name type", "undecorate"); break; + case COFF::IMPORT_NAME_EXPORTAS: + Writer.printString("Name type", "export as"); + break; } + if (H->getNameType() != COFF::IMPORT_ORDINAL) + Writer.printString("Export name", File->getExportName()); + for (const object::BasicSymbolRef &Sym : File->symbols()) { raw_ostream &OS = Writer.startLine(); OS << "Symbol: "; diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index f369a63add1149..387124ad53e408 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -1478,6 +1478,7 @@ static StringRef segmentTypeToString(unsigned Arch, unsigned Type) { LLVM_READOBJ_ENUM_CASE(ELF, PT_OPENBSD_RANDOMIZE); LLVM_READOBJ_ENUM_CASE(ELF, PT_OPENBSD_WXNEEDED); LLVM_READOBJ_ENUM_CASE(ELF, PT_OPENBSD_NOBTCFI); + LLVM_READOBJ_ENUM_CASE(ELF, PT_OPENBSD_SYSCALLS); LLVM_READOBJ_ENUM_CASE(ELF, PT_OPENBSD_BOOTDATA); default: return ""; diff --git a/llvm/tools/llvm-readobj/ObjDumper.cpp b/llvm/tools/llvm-readobj/ObjDumper.cpp index 59060ac217e32f..0d3fea71aafd42 100644 --- a/llvm/tools/llvm-readobj/ObjDumper.cpp +++ b/llvm/tools/llvm-readobj/ObjDumper.cpp @@ -14,6 +14,7 @@ #include "ObjDumper.h" #include "llvm-readobj.h" #include "llvm/Object/Archive.h" +#include "llvm/Object/Decompressor.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/Error.h" #include 
"llvm/Support/FormatVariadic.h" @@ -142,8 +143,23 @@ getSectionRefsByNameOrIndex(const object::ObjectFile &Obj, return Ret; } +static void maybeDecompress(const object::ObjectFile &Obj, + StringRef SectionName, StringRef &SectionContent, + SmallString<0> &Out) { + Expected Decompressor = object::Decompressor::create( + SectionName, SectionContent, Obj.isLittleEndian(), Obj.is64Bit()); + if (!Decompressor) + reportWarning(Decompressor.takeError(), Obj.getFileName()); + else if (auto Err = Decompressor->resizeAndDecompress(Out)) + reportWarning(std::move(Err), Obj.getFileName()); + else + SectionContent = Out; +} + void ObjDumper::printSectionsAsString(const object::ObjectFile &Obj, - ArrayRef Sections) { + ArrayRef Sections, + bool Decompress) { + SmallString<0> Out; bool First = true; for (object::SectionRef Section : getSectionRefsByNameOrIndex(Obj, Sections)) { @@ -156,12 +172,16 @@ void ObjDumper::printSectionsAsString(const object::ObjectFile &Obj, StringRef SectionContent = unwrapOrError(Obj.getFileName(), Section.getContents()); + if (Decompress && Section.isCompressed()) + maybeDecompress(Obj, SectionName, SectionContent, Out); printAsStringList(SectionContent); } } void ObjDumper::printSectionsAsHex(const object::ObjectFile &Obj, - ArrayRef Sections) { + ArrayRef Sections, + bool Decompress) { + SmallString<0> Out; bool First = true; for (object::SectionRef Section : getSectionRefsByNameOrIndex(Obj, Sections)) { @@ -174,6 +194,8 @@ void ObjDumper::printSectionsAsHex(const object::ObjectFile &Obj, StringRef SectionContent = unwrapOrError(Obj.getFileName(), Section.getContents()); + if (Decompress && Section.isCompressed()) + maybeDecompress(Obj, SectionName, SectionContent, Out); const uint8_t *SecContent = SectionContent.bytes_begin(); const uint8_t *SecEnd = SecContent + SectionContent.size(); diff --git a/llvm/tools/llvm-readobj/ObjDumper.h b/llvm/tools/llvm-readobj/ObjDumper.h index 1d679453581bc8..3958dd3a333332 100644 --- 
a/llvm/tools/llvm-readobj/ObjDumper.h +++ b/llvm/tools/llvm-readobj/ObjDumper.h @@ -175,9 +175,9 @@ class ObjDumper { void printAsStringList(StringRef StringContent, size_t StringDataOffset = 0); void printSectionsAsString(const object::ObjectFile &Obj, - ArrayRef Sections); + ArrayRef Sections, bool Decompress); void printSectionsAsHex(const object::ObjectFile &Obj, - ArrayRef Sections); + ArrayRef Sections, bool Decompress); std::function WarningHandler; void reportUniqueWarning(Error Err) const; diff --git a/llvm/tools/llvm-readobj/Opts.td b/llvm/tools/llvm-readobj/Opts.td index e2d93c6ec229e9..018facc278e891 100644 --- a/llvm/tools/llvm-readobj/Opts.td +++ b/llvm/tools/llvm-readobj/Opts.td @@ -20,6 +20,7 @@ def all : FF<"all", "Equivalent to setting: --file-header, --program-headers, -- def arch_specific : FF<"arch-specific", "Display architecture-specific information">; def bb_addr_map : FF<"bb-addr-map", "Display the BB address map section">; def cg_profile : FF<"cg-profile", "Display call graph profile section">; +def decompress : FF<"decompress", "Dump decompressed section content when used with -x or -p">; defm demangle : BB<"demangle", "Demangle symbol names", "Do not demangle symbol names (default)">; def dependent_libraries : FF<"dependent-libraries", "Display the dependent libraries section">; def dyn_relocations : FF<"dyn-relocations", "Display the dynamic relocation entries in the file">; @@ -139,3 +140,4 @@ def : F<"u", "Alias for --unwind">, Alias; def : F<"X", "Alias for --extra-sym-info">, Alias, Group; def : F<"V", "Alias for --version-info">, Alias, Group; def : JoinedOrSeparate<["-"], "x">, Alias, HelpText<"Alias for --hex-dump">, MetaVarName<"">; +def : F<"z", "Alias for --decompress">, Alias; diff --git a/llvm/tools/llvm-readobj/llvm-readobj.cpp b/llvm/tools/llvm-readobj/llvm-readobj.cpp index f9d605d35244bf..979433d69011c3 100644 --- a/llvm/tools/llvm-readobj/llvm-readobj.cpp +++ b/llvm/tools/llvm-readobj/llvm-readobj.cpp @@ -97,6 +97,7 @@ 
static bool ArchSpecificInfo; static bool BBAddrMap; bool ExpandRelocs; static bool CGProfile; +static bool Decompress; bool Demangle; static bool DependentLibraries; static bool DynRelocs; @@ -212,6 +213,7 @@ static void parseOptions(const opt::InputArgList &Args) { opts::ArchSpecificInfo = Args.hasArg(OPT_arch_specific); opts::BBAddrMap = Args.hasArg(OPT_bb_addr_map); opts::CGProfile = Args.hasArg(OPT_cg_profile); + opts::Decompress = Args.hasArg(OPT_decompress); opts::Demangle = Args.hasFlag(OPT_demangle, OPT_no_demangle, false); opts::DependentLibraries = Args.hasArg(OPT_dependent_libraries); opts::DynRelocs = Args.hasArg(OPT_dyn_relocations); @@ -439,9 +441,9 @@ static void dumpObject(ObjectFile &Obj, ScopedPrinter &Writer, Dumper->printSymbols(opts::Symbols, opts::DynamicSymbols, opts::ExtraSymInfo, SymComp); if (!opts::StringDump.empty()) - Dumper->printSectionsAsString(Obj, opts::StringDump); + Dumper->printSectionsAsString(Obj, opts::StringDump, opts::Decompress); if (!opts::HexDump.empty()) - Dumper->printSectionsAsHex(Obj, opts::HexDump); + Dumper->printSectionsAsHex(Obj, opts::HexDump, opts::Decompress); if (opts::HashTable) Dumper->printHashTable(); if (opts::GnuHashTable) diff --git a/llvm/tools/llvm-shlib/CMakeLists.txt b/llvm/tools/llvm-shlib/CMakeLists.txt index a47a0ec84c625c..b20ac318e768db 100644 --- a/llvm/tools/llvm-shlib/CMakeLists.txt +++ b/llvm/tools/llvm-shlib/CMakeLists.txt @@ -33,7 +33,13 @@ if(LLVM_BUILD_LLVM_DYLIB) if (LLVM_LINK_LLVM_DYLIB) set(INSTALL_WITH_TOOLCHAIN INSTALL_WITH_TOOLCHAIN) endif() - add_llvm_library(LLVM SHARED DISABLE_LLVM_LINK_LLVM_DYLIB SONAME ${INSTALL_WITH_TOOLCHAIN} ${SOURCES}) + if (WIN32) + add_llvm_library(LLVM SHARED DISABLE_LLVM_LINK_LLVM_DYLIB SONAME ${INSTALL_WITH_TOOLCHAIN} ${SOURCES}) + else() + add_llvm_library(LLVM SHARED DISABLE_LLVM_LINK_LLVM_DYLIB OUTPUT_NAME LLVM ${INSTALL_WITH_TOOLCHAIN} ${SOURCES}) + # Add symlink for backwards compatibility with old library name + 
llvm_install_library_symlink(LLVM-${LLVM_VERSION_MAJOR}${LLVM_VERSION_SUFFIX} $ SHARED FULL_DEST COMPONENT LLVM) + endif() list(REMOVE_DUPLICATES LIB_NAMES) if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin") diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp index d7876b7ce87490..531360a697039c 100644 --- a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp @@ -1556,12 +1556,12 @@ TEST_F(AArch64GISelMITest, FewerElementsPhi) { CHECK: [[PHI0:%[0-9]+]]:_(<2 x s32>) = G_PHI [[INITVAL_E01]]:_(<2 x s32>), %bb.0, [[MIDVAL_E01]]:_(<2 x s32>), %bb.1 CHECK: [[PHI1:%[0-9]+]]:_(<2 x s32>) = G_PHI [[INITVAL_E23]]:_(<2 x s32>), %bb.0, [[MIDVAL_E23]]:_(<2 x s32>), %bb.1 CHECK: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[INITVAL_E4]]:_(s32), %bb.0, [[MIDVAL_E4]]:_(s32), %bb.1 - CHECK: [[UNMERGE0:%[0-9]+]]:_(s32), [[UNMERGE1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PHI0]]:_(<2 x s32>) - CHECK: [[UNMERGE2:%[0-9]+]]:_(s32), [[UNMERGE3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PHI1]]:_(<2 x s32>) - CHECK: [[BV:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[UNMERGE0]]:_(s32), [[UNMERGE1]]:_(s32), [[UNMERGE2]]:_(s32), [[UNMERGE3]]:_(s32), [[PHI2]]:_(s32) CHECK: [[OTHER_PHI:%[0-9]+]]:_(s64) = G_PHI + CHECK: [[UNMERGE0:%[0-9]+]]:_(s32), [[UNMERGE1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PHI0]]:_(<2 x s32>) + CHECK: [[UNMERGE2:%[0-9]+]]:_(s32), [[UNMERGE3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PHI1]]:_(<2 x s32>) + CHECK: [[BV:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[UNMERGE0]]:_(s32), [[UNMERGE1]]:_(s32), [[UNMERGE2]]:_(s32), [[UNMERGE3]]:_(s32), [[PHI2]]:_(s32) CHECK: [[USE_OP:%[0-9]+]]:_(<5 x s32>) = G_AND [[BV]]:_, [[BV]]:_ )"; diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp index e505af5d3275ef..f60bb4c135bec3 100644 --- a/llvm/unittests/IR/ConstantRangeTest.cpp +++ b/llvm/unittests/IR/ConstantRangeTest.cpp @@ -2479,6 
+2479,24 @@ TEST_F(ConstantRangeTest, castOps) { ConstantRange IntToPtr = A.castOp(Instruction::IntToPtr, 64); EXPECT_EQ(64u, IntToPtr.getBitWidth()); EXPECT_TRUE(IntToPtr.isFullSet()); + + ConstantRange UIToFP = A.castOp(Instruction::UIToFP, 16); + EXPECT_EQ(16u, UIToFP.getBitWidth()); + EXPECT_TRUE(UIToFP.isFullSet()); + + ConstantRange UIToFP2 = A.castOp(Instruction::UIToFP, 64); + ConstantRange B(APInt(64, 0), APInt(64, 65536)); + EXPECT_EQ(64u, UIToFP2.getBitWidth()); + EXPECT_EQ(B, UIToFP2); + + ConstantRange SIToFP = A.castOp(Instruction::SIToFP, 16); + EXPECT_EQ(16u, SIToFP.getBitWidth()); + EXPECT_TRUE(SIToFP.isFullSet()); + + ConstantRange SIToFP2 = A.castOp(Instruction::SIToFP, 64); + ConstantRange C(APInt(64, -32768), APInt(64, 32768)); + EXPECT_EQ(64u, SIToFP2.getBitWidth()); + EXPECT_EQ(C, SIToFP2); } TEST_F(ConstantRangeTest, binaryAnd) { diff --git a/llvm/unittests/ProfileData/CoverageMappingTest.cpp b/llvm/unittests/ProfileData/CoverageMappingTest.cpp index 23f66a0232ddbb..2849781a9dc43b 100644 --- a/llvm/unittests/ProfileData/CoverageMappingTest.cpp +++ b/llvm/unittests/ProfileData/CoverageMappingTest.cpp @@ -890,6 +890,42 @@ TEST_P(CoverageMappingTest, non_code_region_bitmask) { ASSERT_EQ(1U, Names.size()); } +// Test the order of MCDCDecision before Expansion +TEST_P(CoverageMappingTest, decision_before_expansion) { + startFunction("foo", 0x1234); + addCMR(Counter::getCounter(0), "foo", 3, 23, 5, 2); + + // This(4:11) was put after Expansion(4:11) before the fix + addMCDCDecisionCMR(0, 2, "foo", 4, 11, 4, 20); + + addExpansionCMR("foo", "A", 4, 11, 4, 12); + addExpansionCMR("foo", "B", 4, 19, 4, 20); + addCMR(Counter::getCounter(0), "A", 1, 14, 1, 17); + addCMR(Counter::getCounter(0), "A", 1, 14, 1, 17); + addMCDCBranchCMR(Counter::getCounter(0), Counter::getCounter(1), 1, 2, 0, "A", + 1, 14, 1, 17); + addCMR(Counter::getCounter(1), "B", 1, 14, 1, 17); + addMCDCBranchCMR(Counter::getCounter(1), Counter::getCounter(2), 2, 0, 0, "B", + 1, 14, 1, 
17); + + // InputFunctionCoverageData::Regions is rewritten after the write. + auto InputRegions = InputFunctions.back().Regions; + + writeAndReadCoverageRegions(); + + const auto &OutputRegions = OutputFunctions.back().Regions; + + size_t N = ArrayRef(InputRegions).size(); + ASSERT_EQ(N, OutputRegions.size()); + for (size_t I = 0; I < N; ++I) { + ASSERT_EQ(InputRegions[I].Kind, OutputRegions[I].Kind); + ASSERT_EQ(InputRegions[I].FileID, OutputRegions[I].FileID); + ASSERT_EQ(InputRegions[I].ExpandedFileID, OutputRegions[I].ExpandedFileID); + ASSERT_EQ(InputRegions[I].startLoc(), OutputRegions[I].startLoc()); + ASSERT_EQ(InputRegions[I].endLoc(), OutputRegions[I].endLoc()); + } +} + TEST_P(CoverageMappingTest, strip_filename_prefix) { ProfileWriter.addRecord({"file1:func", 0x1234, {0}}, Err); diff --git a/llvm/unittests/Support/RISCVISAInfoTest.cpp b/llvm/unittests/Support/RISCVISAInfoTest.cpp index 9b7112fa2bfeb5..24ed3e2e4b0561 100644 --- a/llvm/unittests/Support/RISCVISAInfoTest.cpp +++ b/llvm/unittests/Support/RISCVISAInfoTest.cpp @@ -366,55 +366,51 @@ TEST(ParseArchString, RejectsDuplicateExtensionNames) { TEST(ParseArchString, RejectsExperimentalExtensionsIfNotEnableExperimentalExtension) { EXPECT_EQ( - toString( - RISCVISAInfo::parseArchString("rv64izicond", false).takeError()), + toString(RISCVISAInfo::parseArchString("rv64iztso", false).takeError()), "requires '-menable-experimental-extensions' for experimental extension " - "'zicond'"); + "'ztso'"); } TEST(ParseArchString, AcceptsExperimentalExtensionsIfEnableExperimentalExtension) { - // Note: If zicond becomes none-experimental, this test will need + // Note: If ztso becomes none-experimental, this test will need // updating (and unfortunately, it will still pass). The failure of // RejectsExperimentalExtensionsIfNotEnableExperimentalExtension will // hopefully serve as a reminder to update. 
- auto MaybeISAInfo = - RISCVISAInfo::parseArchString("rv64izicond", true, false); + auto MaybeISAInfo = RISCVISAInfo::parseArchString("rv64iztso", true, false); ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded()); RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions(); EXPECT_EQ(Exts.size(), 2UL); - EXPECT_EQ(Exts.count("zicond"), 1U); - auto MaybeISAInfo2 = RISCVISAInfo::parseArchString("rv64izicond1p0", true); + EXPECT_EQ(Exts.count("ztso"), 1U); + auto MaybeISAInfo2 = RISCVISAInfo::parseArchString("rv64iztso0p1", true); ASSERT_THAT_EXPECTED(MaybeISAInfo2, Succeeded()); RISCVISAInfo::OrderedExtensionMap Exts2 = (*MaybeISAInfo2)->getExtensions(); EXPECT_EQ(Exts2.size(), 2UL); - EXPECT_EQ(Exts2.count("zicond"), 1U); + EXPECT_EQ(Exts2.count("ztso"), 1U); } TEST(ParseArchString, RequiresExplicitVersionNumberForExperimentalExtensionByDefault) { EXPECT_EQ( - toString( - RISCVISAInfo::parseArchString("rv64izicond", true).takeError()), - "experimental extension requires explicit version number `zicond`"); + toString(RISCVISAInfo::parseArchString("rv64iztso", true).takeError()), + "experimental extension requires explicit version number `ztso`"); } TEST(ParseArchString, AcceptsUnrecognizedVersionIfNotExperimentalExtensionVersionCheck) { auto MaybeISAInfo = - RISCVISAInfo::parseArchString("rv64izicond9p9", true, false); + RISCVISAInfo::parseArchString("rv64iztso9p9", true, false); ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded()); RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions(); EXPECT_EQ(Exts.size(), 2UL); - EXPECT_TRUE(Exts.at("zicond") == (RISCVISAInfo::ExtensionVersion{9, 9})); + EXPECT_TRUE(Exts.at("ztso") == (RISCVISAInfo::ExtensionVersion{9, 9})); } TEST(ParseArchString, RejectsUnrecognizedVersionForExperimentalExtension) { EXPECT_EQ( - toString( - RISCVISAInfo::parseArchString("rv64izicond9p9", true).takeError()), - "unsupported version number 9.9 for experimental extension 'zicond' " - "(this compiler supports 1.0)"); + 
toString(RISCVISAInfo::parseArchString("rv64iztso9p9", true).takeError()), + "unsupported version number 9.9 for experimental extension 'ztso' " + "(this compiler supports 0.1)"); } TEST(ParseArchString, RejectsExtensionVersionForG) { @@ -489,16 +485,16 @@ TEST(ParseArchString, RejectsConflictingExtensions) { TEST(ToFeatures, IIsDroppedAndExperimentalExtensionsArePrefixed) { auto MaybeISAInfo1 = - RISCVISAInfo::parseArchString("rv64im_zicond", true, false); + RISCVISAInfo::parseArchString("rv64im_ztso", true, false); ASSERT_THAT_EXPECTED(MaybeISAInfo1, Succeeded()); EXPECT_THAT((*MaybeISAInfo1)->toFeatures(), - ElementsAre("+m", "+experimental-zicond")); + ElementsAre("+m", "+experimental-ztso")); - auto MaybeISAInfo2 = RISCVISAInfo::parseArchString( - "rv32e_zicond_xventanacondops", true, false); + auto MaybeISAInfo2 = + RISCVISAInfo::parseArchString("rv32e_ztso_xventanacondops", true, false); ASSERT_THAT_EXPECTED(MaybeISAInfo2, Succeeded()); EXPECT_THAT((*MaybeISAInfo2)->toFeatures(), - ElementsAre("+e", "+experimental-zicond", "+xventanacondops")); + ElementsAre("+e", "+experimental-ztso", "+xventanacondops")); } TEST(ToFeatures, UnsupportedExtensionsAreDropped) { @@ -649,10 +645,10 @@ TEST(isSupportedExtensionWithVersion, AcceptsSingleExtensionWithVersion) { TEST(getTargetFeatureForExtension, RetrieveTargetFeatureFromOneExt) { EXPECT_EQ(RISCVISAInfo::getTargetFeatureForExtension("zbb"), "zbb"); - EXPECT_EQ(RISCVISAInfo::getTargetFeatureForExtension("zicond1p0"), - "experimental-zicond"); - EXPECT_EQ(RISCVISAInfo::getTargetFeatureForExtension("zicond"), - "experimental-zicond"); + EXPECT_EQ(RISCVISAInfo::getTargetFeatureForExtension("ztso0p1"), + "experimental-ztso"); + EXPECT_EQ(RISCVISAInfo::getTargetFeatureForExtension("ztso"), + "experimental-ztso"); EXPECT_EQ(RISCVISAInfo::getTargetFeatureForExtension("zihintntl1234p4321"), ""); EXPECT_EQ(RISCVISAInfo::getTargetFeatureForExtension("zfoo"), ""); @@ -684,6 +680,7 @@ R"(All available -march extensions for 
RISC-V zicclsm 1.0 ziccrse 1.0 zicntr 2.0 + zicond 1.0 zicsr 2.0 zifencei 2.0 zihintntl 1.0 @@ -793,7 +790,6 @@ R"(All available -march extensions for RISC-V Experimental extensions zicfilp 0.4 This is a long dummy description zicfiss 0.4 - zicond 1.0 zimop 0.1 zacas 1.0 zfbfmin 1.0 diff --git a/llvm/unittests/TargetParser/Host.cpp b/llvm/unittests/TargetParser/Host.cpp index 5f151616d7ca6a..6aa1d7a087ebf0 100644 --- a/llvm/unittests/TargetParser/Host.cpp +++ b/llvm/unittests/TargetParser/Host.cpp @@ -122,6 +122,9 @@ TEST(getLinuxHostCPUName, AArch64) { EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0xc0\n" "CPU part : 0xac4"), "ampere1a"); + EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0xc0\n" + "CPU part : 0xac5"), + "ampere1b"); // MSM8992/4 weirdness StringRef MSM8992ProcCpuInfo = R"( diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index d31fcd1bb1b00d..131741ff7fd07a 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -133,8 +133,8 @@ template struct AssertSameExtensionFlags { return testing::AssertionFailure() << llvm::formatv( "CPU: {4}\n" - "Expected extension flags: {0} ({1:x})\n" - " Got extension flags: {2} ({3:x})\n", + "Expected extension flags: {0} ({1})\n" + " Got extension flags: {2} ({3})\n", FormatExtensionFlags(ExpectedFlags), SerializeExtensionFlags(ExpectedFlags), FormatExtensionFlags(GotFlags), @@ -1260,7 +1260,8 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_AES, AArch64::AEK_SHA2, AArch64::AEK_SHA3, AArch64::AEK_SM4, AArch64::AEK_FP16, AArch64::AEK_BF16, AArch64::AEK_PROFILE, AArch64::AEK_RAND, AArch64::AEK_FP16FML, - AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.4-A"), ARMCPUTestParams( "neoverse-v2", "armv9-a", "neon-fp-armv8", @@ -1275,7 +1276,8 @@ INSTANTIATE_TEST_SUITE_P( 
AArch64::AEK_SVE2, AArch64::AEK_PROFILE, AArch64::AEK_FP16FML, AArch64::AEK_I8MM, AArch64::AEK_SVE2BITPERM, AArch64::AEK_RAND, - AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "9-A"), ARMCPUTestParams( "cortex-r82", "armv8-r", "crypto-neon-fp-armv8", @@ -1284,7 +1286,7 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_DOTPROD, AArch64::AEK_FP, AArch64::AEK_SIMD, AArch64::AEK_FP16, AArch64::AEK_FP16FML, AArch64::AEK_RAS, AArch64::AEK_RCPC, AArch64::AEK_LSE, AArch64::AEK_SB, - AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_JSCVT, AArch64::AEK_FCMA, AArch64::AEK_PAUTH})), "8-R"), ARMCPUTestParams( "cortex-x1", "armv8.2-a", "crypto-neon-fp-armv8", @@ -1389,7 +1391,8 @@ INSTANTIATE_TEST_SUITE_P( {AArch64::AEK_CRC, AArch64::AEK_AES, AArch64::AEK_SHA2, AArch64::AEK_FP, AArch64::AEK_SIMD, AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, AArch64::AEK_RCPC, - AArch64::AEK_FP16, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_FP16, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.3-A"), ARMCPUTestParams( "apple-a13", "armv8.4-a", "crypto-neon-fp-armv8", @@ -1399,7 +1402,7 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, AArch64::AEK_FP16, AArch64::AEK_FP16FML, AArch64::AEK_SHA3, AArch64::AEK_JSCVT, - AArch64::AEK_FCMA})), + AArch64::AEK_FCMA, AArch64::AEK_PAUTH})), "8.4-A"), ARMCPUTestParams( "apple-a14", "armv8.5-a", "crypto-neon-fp-armv8", @@ -1409,7 +1412,7 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, AArch64::AEK_FP16, AArch64::AEK_FP16FML, AArch64::AEK_SHA3, AArch64::AEK_JSCVT, - AArch64::AEK_FCMA})), + AArch64::AEK_FCMA, AArch64::AEK_PAUTH})), "8.5-A"), ARMCPUTestParams( "apple-a15", "armv8.6-a", "crypto-neon-fp-armv8", @@ -1419,7 +1422,8 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, 
AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, AArch64::AEK_FP16, AArch64::AEK_FP16FML, AArch64::AEK_SHA3, AArch64::AEK_BF16, - AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.6-A"), ARMCPUTestParams( "apple-a16", "armv8.6-a", "crypto-neon-fp-armv8", @@ -1429,7 +1433,8 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, AArch64::AEK_FP16, AArch64::AEK_FP16FML, AArch64::AEK_SHA3, AArch64::AEK_BF16, - AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.6-A"), ARMCPUTestParams( "apple-a17", "armv8.6-a", "crypto-neon-fp-armv8", @@ -1439,7 +1444,8 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, AArch64::AEK_FP16, AArch64::AEK_FP16FML, AArch64::AEK_SHA3, AArch64::AEK_BF16, - AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.6-A"), ARMCPUTestParams( "apple-m1", "armv8.5-a", "crypto-neon-fp-armv8", @@ -1449,7 +1455,7 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, AArch64::AEK_FP16, AArch64::AEK_FP16FML, AArch64::AEK_SHA3, AArch64::AEK_JSCVT, - AArch64::AEK_FCMA})), + AArch64::AEK_FCMA, AArch64::AEK_PAUTH})), "8.5-A"), ARMCPUTestParams( "apple-m2", "armv8.6-a", "crypto-neon-fp-armv8", @@ -1459,7 +1465,8 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, AArch64::AEK_FP16, AArch64::AEK_FP16FML, AArch64::AEK_SHA3, AArch64::AEK_BF16, - AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.6-A"), 
ARMCPUTestParams( "apple-m3", "armv8.6-a", "crypto-neon-fp-armv8", @@ -1469,7 +1476,8 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, AArch64::AEK_FP16, AArch64::AEK_FP16FML, AArch64::AEK_SHA3, AArch64::AEK_BF16, - AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.6-A"), ARMCPUTestParams( "apple-s4", "armv8.3-a", "crypto-neon-fp-armv8", @@ -1477,7 +1485,8 @@ INSTANTIATE_TEST_SUITE_P( {AArch64::AEK_CRC, AArch64::AEK_AES, AArch64::AEK_SHA2, AArch64::AEK_FP, AArch64::AEK_SIMD, AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, AArch64::AEK_RCPC, - AArch64::AEK_FP16, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_FP16, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.3-A"), ARMCPUTestParams( "apple-s5", "armv8.3-a", "crypto-neon-fp-armv8", @@ -1485,7 +1494,8 @@ INSTANTIATE_TEST_SUITE_P( {AArch64::AEK_CRC, AArch64::AEK_AES, AArch64::AEK_SHA2, AArch64::AEK_FP, AArch64::AEK_SIMD, AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, AArch64::AEK_RCPC, - AArch64::AEK_FP16, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_FP16, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.3-A"), ARMCPUTestParams( "exynos-m3", "armv8-a", "crypto-neon-fp-armv8", @@ -1550,7 +1560,7 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_SB, AArch64::AEK_SVE2, AArch64::AEK_SVE2BITPERM, AArch64::AEK_BF16, AArch64::AEK_I8MM, AArch64::AEK_JSCVT, - AArch64::AEK_FCMA})), + AArch64::AEK_FCMA, AArch64::AEK_PAUTH})), "8.5-A"), ARMCPUTestParams( "ampere1", "armv8.6-a", "crypto-neon-fp-armv8", @@ -1561,7 +1571,7 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_SHA3, AArch64::AEK_BF16, AArch64::AEK_SHA2, AArch64::AEK_AES, AArch64::AEK_I8MM, AArch64::AEK_SSBS, AArch64::AEK_SB, AArch64::AEK_RAND, AArch64::AEK_JSCVT, - AArch64::AEK_FCMA})), + AArch64::AEK_FCMA, AArch64::AEK_PAUTH})), "8.6-A"), 
ARMCPUTestParams( "ampere1a", "armv8.6-a", "crypto-neon-fp-armv8", @@ -1572,8 +1582,21 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_SM4, AArch64::AEK_SHA3, AArch64::AEK_BF16, AArch64::AEK_SHA2, AArch64::AEK_AES, AArch64::AEK_I8MM, AArch64::AEK_SSBS, AArch64::AEK_SB, AArch64::AEK_RAND, - AArch64::AEK_MTE, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_MTE, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.6-A"), + ARMCPUTestParams( + "ampere1b", "armv8.7-a", "crypto-neon-fp-armv8", + (AArch64::ExtensionBitset( + {AArch64::AEK_CRC, AArch64::AEK_FP, AArch64::AEK_FP16, + AArch64::AEK_SIMD, AArch64::AEK_RAS, AArch64::AEK_LSE, + AArch64::AEK_RDM, AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, + AArch64::AEK_SM4, AArch64::AEK_SHA3, AArch64::AEK_BF16, + AArch64::AEK_SHA2, AArch64::AEK_AES, AArch64::AEK_I8MM, + AArch64::AEK_SSBS, AArch64::AEK_SB, AArch64::AEK_RAND, + AArch64::AEK_MTE, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH, AArch64::AEK_CSSC})), + "8.7-A"), ARMCPUTestParams( "neoverse-512tvb", "armv8.4-a", "crypto-neon-fp-armv8", (AArch64::ExtensionBitset( @@ -1584,7 +1607,8 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_AES, AArch64::AEK_SHA2, AArch64::AEK_SHA3, AArch64::AEK_SM4, AArch64::AEK_FP16, AArch64::AEK_BF16, AArch64::AEK_PROFILE, AArch64::AEK_RAND, AArch64::AEK_FP16FML, - AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.4-A"), ARMCPUTestParams( "thunderx2t99", "armv8.1-a", "crypto-neon-fp-armv8", @@ -1599,7 +1623,7 @@ INSTANTIATE_TEST_SUITE_P( {AArch64::AEK_CRC, AArch64::AEK_AES, AArch64::AEK_SHA2, AArch64::AEK_LSE, AArch64::AEK_RDM, AArch64::AEK_FP, AArch64::AEK_SIMD, AArch64::AEK_RAS, AArch64::AEK_RCPC, - AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_JSCVT, AArch64::AEK_FCMA, AArch64::AEK_PAUTH})), "8.3-A"), ARMCPUTestParams( "thunderx", "armv8-a", "crypto-neon-fp-armv8", @@ -1651,7 +1675,7 @@ INSTANTIATE_TEST_SUITE_P( 
"8.2-A"))); // Note: number of CPUs includes aliases. -static constexpr unsigned NumAArch64CPUArchs = 67; +static constexpr unsigned NumAArch64CPUArchs = 69; TEST(TargetParserTest, testAArch64CPUArchList) { SmallVector List; diff --git a/llvm/utils/TableGen/Attributes.cpp b/llvm/utils/TableGen/Attributes.cpp index 474042a3e9a331..db3c4decccb4cf 100644 --- a/llvm/utils/TableGen/Attributes.cpp +++ b/llvm/utils/TableGen/Attributes.cpp @@ -87,7 +87,11 @@ void Attributes::emitFnAttrCompatCheck(raw_ostream &OS, bool IsStringAttr) { for (auto *Rule : CompatRules) { StringRef FuncName = Rule->getValueAsString("CompatFunc"); - OS << " Ret &= " << FuncName << "(Caller, Callee);\n"; + OS << " Ret &= " << FuncName << "(Caller, Callee"; + StringRef AttrName = Rule->getValueAsString("AttrName"); + if (!AttrName.empty()) + OS << ", \"" << AttrName << "\""; + OS << ");\n"; } OS << "\n"; diff --git a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp index 455183987b7b27..50156d34528c15 100644 --- a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp @@ -57,7 +57,8 @@ class MatcherTableEmitter { // We de-duplicate the predicates by code string, and use this map to track // all the patterns with "identical" predicates. 
- StringMap> NodePredicatesByCodeToRun; + MapVector, StringMap> + NodePredicatesByCodeToRun; std::vector PatternPredicates; diff --git a/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp b/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp index 78dcd4471ae747..7f494e532b1f44 100644 --- a/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp +++ b/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp @@ -152,8 +152,7 @@ void MacroFusionPredicatorEmitter::emitFirstPredicate(Record *Predicate, << "if (FirstDest.isVirtual() && !MRI.hasOneNonDBGUse(FirstDest))\n"; OS.indent(4) << " return false;\n"; OS.indent(2) << "}\n"; - } else if (Predicate->isSubClassOf( - "FirstFusionPredicateWithMCInstPredicate")) { + } else if (Predicate->isSubClassOf("FusionPredicateWithMCInstPredicate")) { OS.indent(2) << "{\n"; OS.indent(4) << "const MachineInstr *MI = FirstMI;\n"; OS.indent(4) << "if ("; @@ -173,7 +172,7 @@ void MacroFusionPredicatorEmitter::emitFirstPredicate(Record *Predicate, void MacroFusionPredicatorEmitter::emitSecondPredicate(Record *Predicate, PredicateExpander &PE, raw_ostream &OS) { - if (Predicate->isSubClassOf("SecondFusionPredicateWithMCInstPredicate")) { + if (Predicate->isSubClassOf("FusionPredicateWithMCInstPredicate")) { OS.indent(2) << "{\n"; OS.indent(4) << "const MachineInstr *MI = &SecondMI;\n"; OS.indent(4) << "if ("; @@ -185,7 +184,7 @@ void MacroFusionPredicatorEmitter::emitSecondPredicate(Record *Predicate, OS.indent(2) << "}\n"; } else { PrintFatalError(Predicate->getLoc(), - "Unsupported predicate for first instruction: " + + "Unsupported predicate for second instruction: " + Predicate->getType()->getAsString()); } } @@ -196,9 +195,8 @@ void MacroFusionPredicatorEmitter::emitBothPredicate(Record *Predicate, if (Predicate->isSubClassOf("FusionPredicateWithCode")) OS << Predicate->getValueAsString("Predicate"); else if (Predicate->isSubClassOf("BothFusionPredicateWithMCInstPredicate")) { - Record *MCPred = Predicate->getValueAsDef("Predicate"); - 
emitFirstPredicate(MCPred, PE, OS); - emitSecondPredicate(MCPred, PE, OS); + emitFirstPredicate(Predicate, PE, OS); + emitSecondPredicate(Predicate, PE, OS); } else if (Predicate->isSubClassOf("TieReg")) { int FirstOpIdx = Predicate->getValueAsInt("FirstOpIdx"); int SecondOpIdx = Predicate->getValueAsInt("SecondOpIdx"); diff --git a/llvm/utils/TableGen/PredicateExpander.cpp b/llvm/utils/TableGen/PredicateExpander.cpp index d3a73e02cd916f..0b9b6389fe3817 100644 --- a/llvm/utils/TableGen/PredicateExpander.cpp +++ b/llvm/utils/TableGen/PredicateExpander.cpp @@ -59,6 +59,30 @@ void PredicateExpander::expandCheckImmOperandSimple(raw_ostream &OS, OS << ")"; } +void PredicateExpander::expandCheckImmOperandLT(raw_ostream &OS, int OpIndex, + int ImmVal, + StringRef FunctionMapper) { + if (!FunctionMapper.empty()) + OS << FunctionMapper << "("; + OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex + << ").getImm()"; + if (!FunctionMapper.empty()) + OS << ")"; + OS << (shouldNegate() ? " >= " : " < ") << ImmVal; +} + +void PredicateExpander::expandCheckImmOperandGT(raw_ostream &OS, int OpIndex, + int ImmVal, + StringRef FunctionMapper) { + if (!FunctionMapper.empty()) + OS << FunctionMapper << "("; + OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex + << ").getImm()"; + if (!FunctionMapper.empty()) + OS << ")"; + OS << (shouldNegate() ? 
" <= " : " > ") << ImmVal; +} + void PredicateExpander::expandCheckRegOperand(raw_ostream &OS, int OpIndex, const Record *Reg, StringRef FunctionMapper) { @@ -352,6 +376,16 @@ void PredicateExpander::expandPredicate(raw_ostream &OS, const Record *Rec) { Rec->getValueAsString("ImmVal"), Rec->getValueAsString("FunctionMapper")); + if (Rec->isSubClassOf("CheckImmOperandLT")) + return expandCheckImmOperandLT(OS, Rec->getValueAsInt("OpIndex"), + Rec->getValueAsInt("ImmVal"), + Rec->getValueAsString("FunctionMapper")); + + if (Rec->isSubClassOf("CheckImmOperandGT")) + return expandCheckImmOperandGT(OS, Rec->getValueAsInt("OpIndex"), + Rec->getValueAsInt("ImmVal"), + Rec->getValueAsString("FunctionMapper")); + if (Rec->isSubClassOf("CheckImmOperandSimple")) return expandCheckImmOperandSimple(OS, Rec->getValueAsInt("OpIndex"), Rec->getValueAsString("FunctionMapper")); diff --git a/llvm/utils/TableGen/PredicateExpander.h b/llvm/utils/TableGen/PredicateExpander.h index cfb0a3d51e6776..a0dc6302397883 100644 --- a/llvm/utils/TableGen/PredicateExpander.h +++ b/llvm/utils/TableGen/PredicateExpander.h @@ -61,6 +61,10 @@ class PredicateExpander { StringRef FunctionMapperer); void expandCheckImmOperandSimple(raw_ostream &OS, int OpIndex, StringRef FunctionMapper); + void expandCheckImmOperandLT(raw_ostream &OS, int OpIndex, int ImmVal, + StringRef FunctionMapper); + void expandCheckImmOperandGT(raw_ostream &OS, int OpIndex, int ImmVal, + StringRef FunctionMapper); void expandCheckRegOperand(raw_ostream &OS, int OpIndex, const Record *Reg, StringRef FunctionMapper); void expandCheckRegOperandSimple(raw_ostream &OS, int OpIndex, diff --git a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp index 8a860d0945bb1a..7ea02ecba324cb 100644 --- a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp @@ -14,6 +14,7 @@ #include "CodeGenInstruction.h" #include "CodeGenTarget.h" #include "X86RecognizableInstr.h" 
+#include "llvm/ADT/StringSwitch.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/X86FoldTablesUtils.h" #include "llvm/TableGen/Record.h" @@ -80,6 +81,7 @@ class X86FoldTablesEmitter { bool FoldStore = false; enum BcastType { BCAST_NONE, + BCAST_W, BCAST_D, BCAST_Q, BCAST_SS, @@ -114,6 +116,9 @@ class X86FoldTablesEmitter { switch (BroadcastKind) { case BCAST_NONE: break; + case BCAST_W: + Attrs += "TB_BCAST_W|"; + break; case BCAST_D: Attrs += "TB_BCAST_D|"; break; @@ -529,45 +534,22 @@ void X86FoldTablesEmitter::addBroadcastEntry( assert(Table.find(RegInst) == Table.end() && "Override entry unexpectedly"); X86FoldTableEntry Result = X86FoldTableEntry(RegInst, MemInst); - Record *RegRec = RegInst->TheDef; - StringRef RegInstName = RegRec->getName(); - StringRef MemInstName = MemInst->TheDef->getName(); - Record *Domain = RegRec->getValueAsDef("ExeDomain"); - bool IsSSEPackedInt = Domain->getName() == "SSEPackedInt"; - if ((RegInstName.contains("DZ") || RegInstName.contains("DWZ") || - RegInstName.contains("Dr") || RegInstName.contains("I32")) && - IsSSEPackedInt) { - assert((MemInstName.contains("DZ") || RegInstName.contains("DWZ") || - MemInstName.contains("Dr") || MemInstName.contains("I32")) && - "Unmatched names for broadcast"); - Result.BroadcastKind = X86FoldTableEntry::BCAST_D; - } else if ((RegInstName.contains("QZ") || RegInstName.contains("QBZ") || - RegInstName.contains("Qr") || RegInstName.contains("I64")) && - IsSSEPackedInt) { - assert((MemInstName.contains("QZ") || MemInstName.contains("QBZ") || - MemInstName.contains("Qr") || MemInstName.contains("I64")) && - "Unmatched names for broadcast"); - Result.BroadcastKind = X86FoldTableEntry::BCAST_Q; - } else if ((RegInstName.contains("PS") || RegInstName.contains("F32") || - RegInstName.contains("CPH")) && - !RegInstName.contains("PH2PS")) { - assert((MemInstName.contains("PS") || MemInstName.contains("F32") || - MemInstName.contains("CPH")) && - "Unmatched names for broadcast"); - 
Result.BroadcastKind = X86FoldTableEntry::BCAST_SS; - } else if ((RegInstName.contains("PD") || RegInstName.contains("F64")) && - !RegInstName.contains("PH2PD")) { - assert((MemInstName.contains("PD") || MemInstName.contains("F64")) && - "Unmatched names for broadcast"); - Result.BroadcastKind = X86FoldTableEntry::BCAST_SD; - } else if (RegInstName.contains("PH")) { - assert(MemInstName.contains("PH") && "Unmatched names for broadcast"); - Result.BroadcastKind = X86FoldTableEntry::BCAST_SH; - } else { - errs() << RegInstName << ", " << MemInstName << "\n"; - llvm_unreachable("Name is not canoicalized for broadcast or " - "ExeDomain is incorrect"); + DagInit *In = MemInst->TheDef->getValueAsDag("InOperandList"); + for (unsigned I = 0, E = In->getNumArgs(); I != E; ++I) { + Result.BroadcastKind = + StringSwitch(In->getArg(I)->getAsString()) + .Case("i16mem", X86FoldTableEntry::BCAST_W) + .Case("i32mem", X86FoldTableEntry::BCAST_D) + .Case("i64mem", X86FoldTableEntry::BCAST_Q) + .Case("f16mem", X86FoldTableEntry::BCAST_SH) + .Case("f32mem", X86FoldTableEntry::BCAST_SS) + .Case("f64mem", X86FoldTableEntry::BCAST_SD) + .Default(X86FoldTableEntry::BCAST_NONE); + if (Result.BroadcastKind != X86FoldTableEntry::BCAST_NONE) + break; } + assert(Result.BroadcastKind != X86FoldTableEntry::BCAST_NONE && + "Unknown memory operand for broadcast"); Table[RegInst] = Result; } diff --git a/llvm/utils/gn/secondary/llvm/version.gni b/llvm/utils/gn/secondary/llvm/version.gni index f1137591766125..e55c1ed3d7a595 100644 --- a/llvm/utils/gn/secondary/llvm/version.gni +++ b/llvm/utils/gn/secondary/llvm/version.gni @@ -1,4 +1,4 @@ llvm_version_major = 18 -llvm_version_minor = 0 +llvm_version_minor = 1 llvm_version_patch = 0 llvm_version = "$llvm_version_major.$llvm_version_minor.$llvm_version_patch" diff --git a/llvm/utils/lit/lit/__init__.py b/llvm/utils/lit/lit/__init__.py index 1eea0887f1d119..1cfcc7d37813bc 100644 --- a/llvm/utils/lit/lit/__init__.py +++ b/llvm/utils/lit/lit/__init__.py 
@@ -2,7 +2,7 @@ __author__ = "Daniel Dunbar" __email__ = "daniel@minormatter.com" -__versioninfo__ = (18, 0, 0) +__versioninfo__ = (18, 1, 5) __version__ = ".".join(str(v) for v in __versioninfo__) + "dev" __all__ = [] diff --git a/llvm/utils/release/github-upload-release.py b/llvm/utils/release/github-upload-release.py index a8bb569d2fc999..8343dee937f78f 100755 --- a/llvm/utils/release/github-upload-release.py +++ b/llvm/utils/release/github-upload-release.py @@ -77,20 +77,28 @@ def upload_files(repo, release, files): parser.add_argument("--token", type=str) parser.add_argument("--release", type=str) parser.add_argument("--user", type=str) +parser.add_argument("--user-token", type=str) # Upload args parser.add_argument("--files", nargs="+", type=str) args = parser.parse_args() -github = github.Github(args.token) -llvm_org = github.get_organization("llvm") +gh = github.Github(args.token) +llvm_org = gh.get_organization("llvm") llvm_repo = llvm_org.get_repo("llvm-project") if args.user: + if not args.user_token: + print("--user-token option required when --user is used") + sys.exit(1) # Validate that this user is allowed to modify releases. 
- user = github.get_user(args.user) - team = llvm_org.get_team_by_slug("llvm-release-managers") + user = gh.get_user(args.user) + team = ( + github.Github(args.user_token) + .get_organization("llvm") + .get_team_by_slug("llvm-release-managers") + ) if not team.has_in_members(user): print("User {} is not a allowed to modify releases".format(args.user)) sys.exit(1) @@ -99,6 +107,6 @@ def upload_files(repo, release, files): sys.exit(1) if args.command == "create": - create_release(llvm_repo, args.release, args.user) + create_release(llvm_repo, args.release) if args.command == "upload": upload_files(llvm_repo, args.release, args.files) diff --git a/llvm/utils/release/test-release.sh b/llvm/utils/release/test-release.sh index 5b1945df47d24a..4314b565e11b03 100755 --- a/llvm/utils/release/test-release.sh +++ b/llvm/utils/release/test-release.sh @@ -532,11 +532,16 @@ function build_llvmCore() { BuildTarget="clang" InstallTarget="install-clang install-clang-resource-headers" # compiler-rt builtins is needed on AIX to have a functional Phase 1 clang. - if [ "$System" = "AIX" -o "$Phase" != "1" ]; then + if [ "$System" = "AIX" ]; then BuildTarget="$BuildTarget runtimes" - InstallTarget="$InstallTarget install-runtimes" + InstallTarget="$InstallTarget install-builtins" fi fi + if [ "$Phase" -eq "3" ]; then + # Build everything at once, with the proper parallelism and verbosity, + # in Phase 3. 
+ BuildTarget= + fi cd $ObjDir echo "# Compiling llvm $Release-$RC $Flavor" diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 516a984399ff81..638e46a2f9c752 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -253,22 +253,23 @@ def ROCDL_mfma_f32_32x32x16_fp8_fp8 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.fp8.f //===---------------------------------------------------------------------===// // WMMA intrinsics -class ROCDL_Wmma_IntrOp traits = []> : +class ROCDL_Wmma_IntrOp overloadedOperands, + list traits = []> : LLVM_IntrOpBase, + [0], overloadedOperands, traits, 1>, Arguments<(ins Variadic:$args)> { let assemblyFormat = "$args attr-dict `:` functional-type($args, $res)"; } // Available on RDNA3 -def ROCDL_wmma_f32_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.f16">; -def ROCDL_wmma_f32_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.bf16">; -def ROCDL_wmma_f16_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f16.16x16x16.f16">; -def ROCDL_wmma_bf16_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.bf16.16x16x16.bf16">; -def ROCDL_wmma_i32_16x16x16_iu8 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu8">; -def ROCDL_wmma_i32_16x16x16_iu4 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu4">; +def ROCDL_wmma_f32_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.f16", [0]>; +def ROCDL_wmma_f32_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.bf16", [0]>; +def ROCDL_wmma_f16_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f16.16x16x16.f16", [0]>; +def ROCDL_wmma_bf16_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.bf16.16x16x16.bf16", [0]>; +def ROCDL_wmma_i32_16x16x16_iu8 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu8", [1]>; +def ROCDL_wmma_i32_16x16x16_iu4 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu4", [1]>; //===---------------------------------------------------------------------===// // Operations on raw buffer resources (stride of 0, bounds checks either off or in diff --git 
a/mlir/include/mlir/IR/Dialect.h b/mlir/include/mlir/IR/Dialect.h index 45f29f37dd3b97..50f6f6de5c2897 100644 --- a/mlir/include/mlir/IR/Dialect.h +++ b/mlir/include/mlir/IR/Dialect.h @@ -281,7 +281,11 @@ class Dialect { /// Register a set of type classes with this dialect. template void addTypes() { - (addType(), ...); + // This initializer_list argument pack expansion is essentially equal to + // using a fold expression with a comma operator. Clang however, refuses + // to compile a fold expression with a depth of more than 256 by default. + // There seem to be no such limitations for initializer_list. + (void)std::initializer_list{0, (addType(), 0)...}; } /// Register a type instance with this dialect. @@ -292,7 +296,11 @@ class Dialect { /// Register a set of attribute classes with this dialect. template void addAttributes() { - (addAttribute(), ...); + // This initializer_list argument pack expansion is essentially equal to + // using a fold expression with a comma operator. Clang however, refuses + // to compile a fold expression with a depth of more than 256 by default. + // There seem to be no such limitations for initializer_list. + (void)std::initializer_list{0, (addAttribute(), 0)...}; } /// Register an attribute instance with this dialect. 
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp index ae2bd8e5b5405d..73d418cb841327 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp @@ -529,7 +529,8 @@ LogicalResult GPUPrintfOpToVPrintfLowering::matchAndRewrite( /*alignment=*/0); for (auto [index, arg] : llvm::enumerate(args)) { Value ptr = rewriter.create( - loc, ptrType, structType, tempAlloc, ArrayRef{0, index}); + loc, ptrType, structType, tempAlloc, + ArrayRef{0, static_cast(index)}); rewriter.create(loc, arg, ptr); } std::array printfArgs = {stringStart, tempAlloc}; diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index f853d5c47b623c..78d4e806246872 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -1041,13 +1041,14 @@ Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateParamsArray( auto arrayPtr = builder.create( loc, llvmPointerType, llvmPointerType, arraySize, /*alignment=*/0); for (const auto &en : llvm::enumerate(arguments)) { + const auto index = static_cast(en.index()); Value fieldPtr = builder.create(loc, llvmPointerType, structType, structPtr, - ArrayRef{0, en.index()}); + ArrayRef{0, index}); builder.create(loc, en.value(), fieldPtr); - auto elementPtr = builder.create( - loc, llvmPointerType, llvmPointerType, arrayPtr, - ArrayRef{en.index()}); + auto elementPtr = + builder.create(loc, llvmPointerType, llvmPointerType, + arrayPtr, ArrayRef{index}); builder.create(loc, fieldPtr, elementPtr); } return arrayPtr; diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp index 72f9295749a66b..b25c831bc7172a 100644 --- a/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp +++ b/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp @@ -488,7 
+488,8 @@ static void splitVectorStore(const DataLayout &dataLayout, Location loc, // Other patterns will turn this into a type-consistent GEP. auto gepOp = rewriter.create( loc, address.getType(), rewriter.getI8Type(), address, - ArrayRef{storeOffset + index * elementSize}); + ArrayRef{ + static_cast(storeOffset + index * elementSize)}); rewriter.create(loc, extractOp, gepOp); } @@ -524,9 +525,9 @@ static void splitIntegerStore(const DataLayout &dataLayout, Location loc, // We create an `i8` indexed GEP here as that is the easiest (offset is // already known). Other patterns turn this into a type-consistent GEP. - auto gepOp = - rewriter.create(loc, address.getType(), rewriter.getI8Type(), - address, ArrayRef{currentOffset}); + auto gepOp = rewriter.create( + loc, address.getType(), rewriter.getI8Type(), address, + ArrayRef{static_cast(currentOffset)}); rewriter.create(loc, valueToStore, gepOp); // No need to care about padding here since we already checked previously diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 140bdd1f2db361..be875297fc93ca 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -2092,6 +2092,7 @@ DiagnosedSilenceableFailure transform::ConvertToLoopsOp::applyToOne( scf::lowerToLoopsUsingSCFForOp(rewriter, target); if (failed(loops)) return emitDefaultDefiniteFailure(target); + rewriter.eraseOp(target); return DiagnosedSilenceableFailure::success(); } diff --git a/mlir/test/Interfaces/TilingInterface/lower-to-loops-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/lower-to-loops-using-interface.mlir index 7969de0d456bb6..1b2c553b25ded0 100644 --- a/mlir/test/Interfaces/TilingInterface/lower-to-loops-using-interface.mlir +++ b/mlir/test/Interfaces/TilingInterface/lower-to-loops-using-interface.mlir @@ -33,6 +33,7 @@ module attributes 
{transform.with_named_sequence} { // CHECK: %[[MULF:.+]] = arith.mulf %[[LHS]], %[[RHS]] // CHECK: %[[ADDF:.+]] = arith.addf %[[OUT]], %[[MULF]] // CHECK: memref.store %[[ADDF]], %[[ARG2]][%[[IV0]], %[[IV1]]] +// CHECK-NOT: linalg.matmul ins(%arg0, %arg1 : memref, memref) // ----- diff --git a/mlir/test/Target/LLVMIR/llvmir-le-specific.mlir b/mlir/test/Target/LLVMIR/llvmir-le-specific.mlir new file mode 100644 index 00000000000000..f8d082082117cb --- /dev/null +++ b/mlir/test/Target/LLVMIR/llvmir-le-specific.mlir @@ -0,0 +1,27 @@ +// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s + +// Decoding the attribute does not work on big-endian platforms currently +// XFAIL: target=s390x-{{.*}} + +// CHECK{LITERAL}: @dense_resource_tensor_constant = internal constant [5 x float] [float 0x3FCA034080000000, float 0xBFD0466300000000, float 0xBFD75DDF80000000, float 0xBFDE074F40000000, float 0x3FDDD3A1C0000000] +llvm.mlir.global internal constant @dense_resource_tensor_constant(dense_resource : tensor<5xf32>) : !llvm.array<5 x f32> + +// CHECK{LITERAL}: @dense_resource_vector_constant = internal constant <5 x float> +llvm.mlir.global internal constant @dense_resource_vector_constant(dense_resource : vector<5xf32>) : vector<5xf32> + + +// CHECK{LITERAL}: @dense_resource_multidim_tensor_constant = internal constant [1 x [2 x [2 x float]]] [[2 x [2 x float]] [[2 x float] [float 0x3FD6B46A80000000, float 0x3FD6781AC0000000], [2 x float] [float 0xBFB45A2AA0000000, float 0x3FD77A5CA0000000]]] +llvm.mlir.global internal constant @dense_resource_multidim_tensor_constant(dense_resource : tensor<1x2x2xf32>) : !llvm.array<1 x !llvm.array<2 x !llvm.array<2 x f32>>> + +// CHECK{LITERAL}: @dense_resource_multidim_vector_constant = internal constant [1 x [2 x <2 x float>]] [[2 x <2 x float>] [<2 x float> , <2 x float> ]] +llvm.mlir.global internal constant @dense_resource_multidim_vector_constant(dense_resource : vector<1x2x2xf32>) : !llvm.array<1 x !llvm.array<2 x 
vector<2 x f32>>> + +// Resources are kept at end of file. New tests should be added above this. +{-# + dialect_resources: { + builtin: { + dense_resource_test_5xf32: "0x08000000041A503E183382BEFCEEBABE7A3AF0BE0E9DEE3E", + dense_resource_test_2x2xf32: "0x0800000054A3B53ED6C0B33E55D1A2BDE5D2BB3E" + } + } +#-} \ No newline at end of file diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir index 448aa3a5d85d79..961c9484446845 100644 --- a/mlir/test/Target/LLVMIR/llvmir.mlir +++ b/mlir/test/Target/LLVMIR/llvmir.mlir @@ -101,19 +101,6 @@ llvm.mlir.global internal @dense_float_vector_3d(dense<[[[1.0, 2.0], [3.0, 4.0]] // CHECK{LITERAL}: @splat_float_vector_3d = internal global [2 x [2 x <2 x float>]] [[2 x <2 x float>] [<2 x float> , <2 x float> ], [2 x <2 x float>] [<2 x float> , <2 x float> ]] llvm.mlir.global internal @splat_float_vector_3d(dense<42.0> : vector<2x2x2xf32>) : !llvm.array<2 x !llvm.array<2 x vector<2xf32>>> -// CHECK{LITERAL}: @dense_resource_tensor_constant = internal constant [5 x float] [float 0x3FCA034080000000, float 0xBFD0466300000000, float 0xBFD75DDF80000000, float 0xBFDE074F40000000, float 0x3FDDD3A1C0000000] -llvm.mlir.global internal constant @dense_resource_tensor_constant(dense_resource : tensor<5xf32>) : !llvm.array<5 x f32> - -// CHECK{LITERAL}: @dense_resource_vector_constant = internal constant <5 x float> -llvm.mlir.global internal constant @dense_resource_vector_constant(dense_resource : vector<5xf32>) : vector<5xf32> - - -// CHECK{LITERAL}: @dense_resource_multidim_tensor_constant = internal constant [1 x [2 x [2 x float]]] [[2 x [2 x float]] [[2 x float] [float 0x3FD6B46A80000000, float 0x3FD6781AC0000000], [2 x float] [float 0xBFB45A2AA0000000, float 0x3FD77A5CA0000000]]] -llvm.mlir.global internal constant @dense_resource_multidim_tensor_constant(dense_resource : tensor<1x2x2xf32>) : !llvm.array<1 x !llvm.array<2 x !llvm.array<2 x f32>>> - -// CHECK{LITERAL}: 
@dense_resource_multidim_vector_constant = internal constant [1 x [2 x <2 x float>]] [[2 x <2 x float>] [<2 x float> , <2 x float> ]] -llvm.mlir.global internal constant @dense_resource_multidim_vector_constant(dense_resource : vector<1x2x2xf32>) : !llvm.array<1 x !llvm.array<2 x vector<2 x f32>>> - // // Linkage attribute. // @@ -1590,16 +1577,6 @@ llvm.func @invokeLandingpad() -> i32 attributes { personality = @__gxx_personali llvm.invoke %9(%6, %0) to ^bb2 unwind ^bb1 vararg(!llvm.func) : !llvm.ptr, (!llvm.ptr, i32) -> () } -// Resources are kept at end of file. New tests should be added above this. -{-# - dialect_resources: { - builtin: { - dense_resource_test_5xf32: "0x08000000041A503E183382BEFCEEBABE7A3AF0BE0E9DEE3E", - dense_resource_test_2x2xf32: "0x0800000054A3B53ED6C0B33E55D1A2BDE5D2BB3E" - } - } -#-} - // ----- llvm.func @foo() -> i8 diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 3c9c70711ae230..26123300d74888 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -248,53 +248,53 @@ llvm.func @rocdl.wmma(%arg0 : vector<8xf32>, %arg1 : vector<16 x f16>, %arg2 : v // ---- Wave32 ----- // f16 -> f32 - // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x float> %{{.*}}) + // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x float> %{{.*}}) %r0 = rocdl.wmma.f32.16x16x16.f16 %arg1, %arg1, %arg0 : (vector<16xf16>, vector<16xf16>, vector<8xf32>) -> vector<8xf32> // bf16 -> f32 - // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x float> %{{.*}}) + // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x float> %{{.*}}) %r1 = rocdl.wmma.f32.16x16x16.bf16 %arg2, %arg2, %arg0 : (vector<16xi16>, vector<16xi16>, vector<8xf32>) -> 
vector<8xf32> // f16 -> f16 (OPSEL = {0,1}) - // CHECK: call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}, i1 {{.*}}) + // CHECK: call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}, i1 {{.*}}) %r2 = rocdl.wmma.f16.16x16x16.f16 %arg1, %arg1, %arg1, %zero : (vector<16xf16>, vector<16xf16>, vector<16xf16>, i1) -> vector<16xf16> // bf16 -> bf16 (OPSEL = {0,1}) - // CHECK: call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, i1 {{.*}}) + // CHECK: call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, i1 {{.*}}) %r4 = rocdl.wmma.bf16.16x16x16.bf16 %arg2, %arg2, %arg2, %zero : (vector<16xi16>, vector<16xi16>, vector<16xi16>, i1) -> vector<16xi16> // int8 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1}) - // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}}) + // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}}) %r5 = rocdl.wmma.i32.16x16x16.iu8 %zero, %arg5, %zero, %arg5, %arg3, %zero : (i1, vector<4xi32>, i1, vector<4xi32>, vector<8xi32>, i1) -> vector<8xi32> // int4 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1}) - // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}}) + // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}}) %r6 = rocdl.wmma.i32.16x16x16.iu4 %zero, %arg4, %zero, %arg4, %arg3, %zero : (i1, vector<2xi32>, i1, vector<2xi32>, vector<8xi32>, i1) -> 
vector<8xi32> // ---- Wave64 ----- // f16 -> f32 - // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <4 x float> %{{.*}}) %r7 = rocdl.wmma.f32.16x16x16.f16 %arg1, %arg1, %arg6 : (vector<16xf16>, vector<16xf16>, vector<4xf32>) -> vector<4xf32> // bf16 -> f32 - // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <4 x float> %{{.*}}) %r8 = rocdl.wmma.f32.16x16x16.bf16 %arg2, %arg2, %arg6 : (vector<16xi16>, vector<16xi16>, vector<4xf32>) -> vector<4xf32> // f16 -> f16 (OPSEL = {0,1}) - // CHECK: call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x half> %{{.*}}, i1 {{.*}}) + // CHECK: call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x half> %{{.*}}, i1 {{.*}}) %r9 = rocdl.wmma.f16.16x16x16.f16 %arg1, %arg1, %arg7, %zero : (vector<16xf16>, vector<16xf16>, vector<8xf16>, i1) -> vector<8xf16> // bf16 -> bf16 (OPSEL = {0,1}) - // CHECK: call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x i16> %{{.*}}, i1 {{.*}}) + // CHECK: call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x i16> %{{.*}}, i1 {{.*}}) %r11 = rocdl.wmma.bf16.16x16x16.bf16 %arg2, %arg2, %arg8, %zero : (vector<16xi16>, vector<16xi16>, vector<8xi16>, i1) -> vector<8xi16> // int8 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1}) - // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}) + // CHECK: call <4 
x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}) %r12 = rocdl.wmma.i32.16x16x16.iu8 %zero, %arg5, %zero, %arg5, %arg5, %zero : (i1, vector<4xi32>, i1, vector<4xi32>, vector<4xi32>, i1) -> vector<4xi32> // int4 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1}) - // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}) + // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}) %r13 = rocdl.wmma.i32.16x16x16.iu4 %zero, %arg4, %zero, %arg4, %arg5, %zero : (i1, vector<2xi32>, i1, vector<2xi32>, vector<4xi32>, i1) -> vector<4xi32> llvm.return %r0 : vector<8xf32> diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp index 71326049af0579..7f748cfbd31ad4 100644 --- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp @@ -3058,7 +3058,7 @@ void OpEmitter::genCodeForAddingArgAndRegionForBuilder( body << llvm::formatv( "static_cast(std::accumulate({0}.begin(), {0}.end(), 0, " "[](int32_t curSum, ::mlir::ValueRange range) {{ return curSum + " - "range.size(); }))", + "static_cast(range.size()); }))", operandName); } else { body << "static_cast(" << getArgumentName(op, i) << ".size())"; diff --git a/openmp/cmake/HandleOpenMPOptions.cmake b/openmp/cmake/HandleOpenMPOptions.cmake index 201aeabbd3df9c..9387d9b3b0ff75 100644 --- a/openmp/cmake/HandleOpenMPOptions.cmake +++ b/openmp/cmake/HandleOpenMPOptions.cmake @@ -9,6 +9,14 @@ if (NOT COMMAND append_if) endfunction() endif() +if (NOT COMMAND append) + function(append value) + foreach(variable ${ARGN}) + set(${variable} "${${variable}} ${value}" PARENT_SCOPE) + endforeach(variable) + endfunction() +endif() + # MSVC and clang-cl in 
compatibility mode map -Wall to -Weverything. # TODO: LLVM adds /W4 instead, check if that works for the OpenMP runtimes. if (NOT MSVC) @@ -38,7 +46,11 @@ append_if(OPENMP_HAVE_WEXTRA_FLAG "-Wno-extra" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_WPEDANTIC_FLAG "-Wno-pedantic" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_WMAYBE_UNINITIALIZED_FLAG "-Wno-maybe-uninitialized" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) -append_if(OPENMP_HAVE_NO_SEMANTIC_INTERPOSITION "-fno-semantic-interposition" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) +if (NOT (WIN32 OR CYGWIN)) + # This flag is not relevant on Windows; the flag is accepted, but produces warnings + # about argument unused during compilation. + append_if(OPENMP_HAVE_NO_SEMANTIC_INTERPOSITION "-fno-semantic-interposition" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) +endif() append_if(OPENMP_HAVE_FUNCTION_SECTIONS "-ffunction-section" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_DATA_SECTIONS "-fdata-sections" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) diff --git a/openmp/docs/ReleaseNotes.rst b/openmp/docs/ReleaseNotes.rst index 3eeaf5c900d800..a5b39f61b0b64c 100644 --- a/openmp/docs/ReleaseNotes.rst +++ b/openmp/docs/ReleaseNotes.rst @@ -19,3 +19,5 @@ from the `LLVM releases web site `_. Non-comprehensive list of changes in this release ================================================= + +* SystemZ support added. 
diff --git a/openmp/libomptarget/include/Shared/SourceInfo.h b/openmp/libomptarget/include/Shared/SourceInfo.h index 7ce5fd43efc07f..711f06a04d017f 100644 --- a/openmp/libomptarget/include/Shared/SourceInfo.h +++ b/openmp/libomptarget/include/Shared/SourceInfo.h @@ -13,6 +13,7 @@ #ifndef OMPTARGET_SHARED_SOURCE_INFO_H #define OMPTARGET_SHARED_SOURCE_INFO_H +#include #include #ifdef _WIN32 diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index c287a31e0b1b54..46ee4c9fba7109 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -818,6 +818,7 @@ class KMPAffinity { typedef KMPAffinity::Mask kmp_affin_mask_t; extern KMPAffinity *__kmp_affinity_dispatch; +#ifndef KMP_OS_AIX class kmp_affinity_raii_t { kmp_affin_mask_t *mask; bool restored; @@ -842,6 +843,7 @@ class kmp_affinity_raii_t { } ~kmp_affinity_raii_t() { restore(); } }; +#endif // !KMP_OS_AIX // Declare local char buffers with this size for printing debug and info // messages, using __kmp_affinity_print_mask(). @@ -1181,7 +1183,11 @@ extern void __kmp_init_target_task(); #define KMP_MIN_STKSIZE ((size_t)(32 * 1024)) #endif +#if KMP_OS_AIX && KMP_ARCH_PPC +#define KMP_MAX_STKSIZE 0x10000000 /* 256Mb max size on 32-bit AIX */ +#else #define KMP_MAX_STKSIZE (~((size_t)1 << ((sizeof(size_t) * (1 << 3)) - 1))) +#endif #if KMP_ARCH_X86 #define KMP_DEFAULT_STKSIZE ((size_t)(2 * 1024 * 1024)) @@ -2494,14 +2500,15 @@ typedef struct kmp_dephash_entry kmp_dephash_entry_t; #define KMP_DEP_MTX 0x4 #define KMP_DEP_SET 0x8 #define KMP_DEP_ALL 0x80 -// Compiler sends us this info: +// Compiler sends us this info. Note: some test cases contain an explicit copy +// of this struct and should be in sync with any changes here. 
typedef struct kmp_depend_info { kmp_intptr_t base_addr; size_t len; union { kmp_uint8 flag; // flag as an unsigned char struct { // flag as a set of 8 bits -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) /* Same fields as in the #else branch, but in reverse order */ unsigned all : 1; unsigned unused : 3; @@ -2666,7 +2673,7 @@ typedef struct kmp_task_stack { #endif // BUILD_TIED_TASK_STACK typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */ -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) /* Same fields as in the #else branch, but in reverse order */ #if OMPX_TASKGRAPH unsigned reserved31 : 6; @@ -3906,7 +3913,7 @@ extern void __kmp_balanced_affinity(kmp_info_t *th, int team_size); #if KMP_WEIGHTED_ITERATIONS_SUPPORTED extern int __kmp_get_first_osid_with_ecore(void); #endif -#if KMP_OS_LINUX || KMP_OS_FREEBSD +#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX extern int kmp_set_thread_affinity_mask_initial(void); #endif static inline void __kmp_assign_root_init_mask() { diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp index 6a41d34b023729..1ac541fbcaa707 100644 --- a/openmp/runtime/src/kmp_affinity.cpp +++ b/openmp/runtime/src/kmp_affinity.cpp @@ -2906,12 +2906,17 @@ static inline const char *__kmp_cpuinfo_get_envvar() { } // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the -// affinity map. +// affinity map. On AIX, the map is obtained through system SRAD (Scheduler +// Resource Allocation Domain). 
static bool __kmp_affinity_create_cpuinfo_map(int *line, kmp_i18n_id_t *const msg_id) { + *msg_id = kmp_i18n_null; + +#if KMP_OS_AIX + unsigned num_records = __kmp_xproc; +#else const char *filename = __kmp_cpuinfo_get_filename(); const char *envvar = __kmp_cpuinfo_get_envvar(); - *msg_id = kmp_i18n_null; if (__kmp_affinity.flags.verbose) { KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); @@ -2970,6 +2975,7 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line, *msg_id = kmp_i18n_str_CantRewindCpuinfo; return false; } +#endif // KMP_OS_AIX // Allocate the array of records to store the proc info in. The dummy // element at the end makes the logic in filling them out easier to code. @@ -2999,6 +3005,99 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line, INIT_PROC_INFO(threadInfo[i]); } +#if KMP_OS_AIX + int smt_threads; + lpar_info_format1_t cpuinfo; + unsigned num_avail = __kmp_xproc; + + if (__kmp_affinity.flags.verbose) + KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "system info for topology"); + + // Get the number of SMT threads per core. + int retval = + lpar_get_info(LPAR_INFO_FORMAT1, &cpuinfo, sizeof(lpar_info_format1_t)); + if (!retval) + smt_threads = cpuinfo.smt_threads; + else { + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_UnknownTopology; + return false; + } + + // Allocate a resource set containing available system resourses. + rsethandle_t sys_rset = rs_alloc(RS_SYSTEM); + if (sys_rset == NULL) { + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_UnknownTopology; + return false; + } + // Allocate a resource set for the SRAD info. + rsethandle_t srad = rs_alloc(RS_EMPTY); + if (srad == NULL) { + rs_free(sys_rset); + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_UnknownTopology; + return false; + } + + // Get the SRAD system detail level. 
+ int sradsdl = rs_getinfo(NULL, R_SRADSDL, 0); + if (sradsdl < 0) { + rs_free(sys_rset); + rs_free(srad); + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_UnknownTopology; + return false; + } + // Get the number of RADs at that SRAD SDL. + int num_rads = rs_numrads(sys_rset, sradsdl, 0); + if (num_rads < 0) { + rs_free(sys_rset); + rs_free(srad); + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_UnknownTopology; + return false; + } + + // Get the maximum number of procs that may be contained in a resource set. + int max_procs = rs_getinfo(NULL, R_MAXPROCS, 0); + if (max_procs < 0) { + rs_free(sys_rset); + rs_free(srad); + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_UnknownTopology; + return false; + } + + int cur_rad = 0; + int num_set = 0; + for (int srad_idx = 0; cur_rad < num_rads && srad_idx < VMI_MAXRADS; + ++srad_idx) { + // Check if the SRAD is available in the RSET. + if (rs_getrad(sys_rset, srad, sradsdl, srad_idx, 0) < 0) + continue; + + for (int cpu = 0; cpu < max_procs; cpu++) { + // Set the info for the cpu if it is in the SRAD. + if (rs_op(RS_TESTRESOURCE, srad, NULL, R_PROCS, cpu)) { + threadInfo[cpu][osIdIndex] = cpu; + threadInfo[cpu][pkgIdIndex] = cur_rad; + threadInfo[cpu][coreIdIndex] = cpu / smt_threads; + ++num_set; + if (num_set >= num_avail) { + // Done if all available CPUs have been set. + break; + } + } + } + ++cur_rad; + } + rs_free(sys_rset); + rs_free(srad); + + // The topology is already sorted. + +#else // !KMP_OS_AIX unsigned num_avail = 0; *line = 0; #if KMP_ARCH_S390X @@ -3246,6 +3345,8 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line, qsort(threadInfo, num_avail, sizeof(*threadInfo), __kmp_affinity_cmp_ProcCpuInfo_phys_id); +#endif // KMP_OS_AIX + // The table is now sorted by pkgId / coreId / threadId, but we really don't // know the radix of any of the fields. pkgId's may be sparsely assigned among // the chips on a system. 
Although coreId's are usually assigned @@ -4441,7 +4542,7 @@ static bool __kmp_aux_affinity_initialize_topology(kmp_affinity_t &affinity) { } #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ -#if KMP_OS_LINUX +#if KMP_OS_LINUX || KMP_OS_AIX if (!success) { int line = 0; success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id); @@ -4837,7 +4938,12 @@ void __kmp_affinity_uninitialize(void) { } if (__kmp_affin_origMask != NULL) { if (KMP_AFFINITY_CAPABLE()) { +#if KMP_OS_AIX + // Uninitialize by unbinding the thread. + bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY); +#else __kmp_set_system_affinity(__kmp_affin_origMask, FALSE); +#endif } KMP_CPU_FREE(__kmp_affin_origMask); __kmp_affin_origMask = NULL; @@ -5011,7 +5117,10 @@ void __kmp_affinity_bind_init_mask(int gtid) { __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); } else #endif +#ifndef KMP_OS_AIX + // Do not set the full mask as the init mask on AIX. __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); +#endif } void __kmp_affinity_bind_place(int gtid) { @@ -5124,7 +5233,7 @@ int __kmp_aux_set_affinity(void **mask) { int __kmp_aux_get_affinity(void **mask) { int gtid; int retval; -#if KMP_OS_WINDOWS || KMP_DEBUG +#if KMP_OS_WINDOWS || KMP_OS_AIX || KMP_DEBUG kmp_info_t *th; #endif if (!KMP_AFFINITY_CAPABLE()) { @@ -5132,7 +5241,7 @@ int __kmp_aux_get_affinity(void **mask) { } gtid = __kmp_entry_gtid(); -#if KMP_OS_WINDOWS || KMP_DEBUG +#if KMP_OS_WINDOWS || KMP_OS_AIX || KMP_DEBUG th = __kmp_threads[gtid]; #else (void)gtid; // unused variable @@ -5155,7 +5264,7 @@ int __kmp_aux_get_affinity(void **mask) { } } -#if !KMP_OS_WINDOWS +#if !KMP_OS_WINDOWS && !KMP_OS_AIX retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); KA_TRACE( @@ -5175,7 +5284,7 @@ int __kmp_aux_get_affinity(void **mask) { KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); return 0; -#endif /* KMP_OS_WINDOWS */ +#endif /* !KMP_OS_WINDOWS && !KMP_OS_AIX */ } int 
__kmp_aux_get_affinity_max_proc() { @@ -5557,7 +5666,7 @@ void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) { } } -#if KMP_OS_LINUX || KMP_OS_FREEBSD +#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX // We don't need this entry for Windows because // there is GetProcessAffinityMask() api // @@ -5592,7 +5701,11 @@ extern "C" "set full mask for thread %d\n", gtid)); KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL); +#if KMP_OS_AIX + return bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY); +#else return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE); +#endif } #endif diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h index 5464259784e2ba..1fb70491a9ede1 100644 --- a/openmp/runtime/src/kmp_affinity.h +++ b/openmp/runtime/src/kmp_affinity.h @@ -191,7 +191,7 @@ class KMPHwlocAffinity : public KMPAffinity { }; #endif /* KMP_USE_HWLOC */ -#if KMP_OS_LINUX || KMP_OS_FREEBSD +#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX #if KMP_OS_LINUX /* On some of the older OS's that we build on, these constants aren't present in #included from . They must be the same on @@ -314,6 +314,10 @@ class KMPHwlocAffinity : public KMPAffinity { #elif KMP_OS_FREEBSD #include #include +#elif KMP_OS_AIX +#include +#include +#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX. #endif class KMPNativeAffinity : public KMPAffinity { class Mask : public KMPAffinity::Mask { @@ -401,6 +405,70 @@ class KMPNativeAffinity : public KMPAffinity { ++retval; return retval; } +#if KMP_OS_AIX + // On AIX, we don't have a way to get CPU(s) a thread is bound to. + // This routine is only used to get the full mask. + int get_system_affinity(bool abort_on_error) override { + KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), + "Illegal get affinity operation when not capable"); + + (void)abort_on_error; + + // Set the mask with all CPUs that are available. 
+ for (int i = 0; i < __kmp_xproc; ++i) + KMP_CPU_SET(i, this); + return 0; + } + int set_system_affinity(bool abort_on_error) const override { + KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), + + "Illegal set affinity operation when not capable"); + + int location; + int gtid = __kmp_entry_gtid(); + int tid = thread_self(); + + // Unbind the thread if it was bound to any processors before so that + // we can bind the thread to CPUs specified by the mask not others. + int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY); + + // On AIX, we can only bind to one instead of a set of CPUs with the + // bindprocessor() system call. + KMP_CPU_SET_ITERATE(location, this) { + if (KMP_CPU_ISSET(location, this)) { + retval = bindprocessor(BINDTHREAD, tid, location); + if (retval == -1 && errno == 1) { + rsid_t rsid; + rsethandle_t rsh; + // Put something in rsh to prevent compiler warning + // about uninitalized use + rsh = rs_alloc(RS_EMPTY); + rsid.at_pid = getpid(); + if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) { + retval = ra_detachrset(R_PROCESS, rsid, 0); + retval = bindprocessor(BINDTHREAD, tid, location); + } + } + if (retval == 0) { + KA_TRACE(10, ("__kmp_set_system_affinity: Done binding " + "T#%d to cpu=%d.\n", + gtid, location)); + continue; + } + int error = errno; + if (abort_on_error) { + __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"), + KMP_ERR(error), __kmp_msg_null); + KA_TRACE(10, ("__kmp_set_system_affinity: Error binding " + "T#%d to cpu=%d, errno=%d.\n", + gtid, location, error)); + return error; + } + } + } + return 0; + } +#else // !KMP_OS_AIX int get_system_affinity(bool abort_on_error) override { KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), "Illegal get affinity operation when not capable"); @@ -443,6 +511,7 @@ class KMPNativeAffinity : public KMPAffinity { } return error; } +#endif // KMP_OS_AIX }; void determine_capable(const char *env_var) override { __kmp_affinity_determine_capable(env_var); @@ -471,7 +540,7 @@ class 
KMPNativeAffinity : public KMPAffinity { } api_type get_api_type() const override { return NATIVE_OS; } }; -#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */ +#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX */ #if KMP_OS_WINDOWS class KMPNativeAffinity : public KMPAffinity { diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp index 9eeaeb88fb9ec7..878e78b5c7ad2d 100644 --- a/openmp/runtime/src/kmp_csupport.cpp +++ b/openmp/runtime/src/kmp_csupport.cpp @@ -1533,8 +1533,9 @@ void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid, kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint); if (*lk == 0) { if (KMP_IS_D_LOCK(lockseq)) { - KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, - KMP_GET_D_TAG(lockseq)); + KMP_COMPARE_AND_STORE_ACQ32( + (volatile kmp_int32 *)&((kmp_base_tas_lock_t *)crit)->poll, 0, + KMP_GET_D_TAG(lockseq)); } else { __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq)); } diff --git a/openmp/runtime/src/kmp_gsupport.cpp b/openmp/runtime/src/kmp_gsupport.cpp index 88189659a23416..4dc8a90f83b4ea 100644 --- a/openmp/runtime/src/kmp_gsupport.cpp +++ b/openmp/runtime/src/kmp_gsupport.cpp @@ -144,7 +144,7 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_BARRIER)(void) { // Mutual exclusion -// The symbol that icc/ifort generates for unnamed for unnamed critical sections +// The symbol that icc/ifort generates for unnamed critical sections // - .gomp_critical_user_ - is defined using .comm in any objects reference it. // We can't reference it directly here in C code, as the symbol contains a ".". // diff --git a/openmp/runtime/src/kmp_lock.cpp b/openmp/runtime/src/kmp_lock.cpp index 85c54f4cdc7e96..0ad14f862bcb9b 100644 --- a/openmp/runtime/src/kmp_lock.cpp +++ b/openmp/runtime/src/kmp_lock.cpp @@ -2689,7 +2689,7 @@ void __kmp_spin_backoff(kmp_backoff_t *boff) { // lock word. 
static void __kmp_init_direct_lock(kmp_dyna_lock_t *lck, kmp_dyna_lockseq_t seq) { - TCW_4(*lck, KMP_GET_D_TAG(seq)); + TCW_4(((kmp_base_tas_lock_t *)lck)->poll, KMP_GET_D_TAG(seq)); KA_TRACE( 20, ("__kmp_init_direct_lock: initialized direct lock with type#%d\n", seq)); @@ -3180,8 +3180,8 @@ kmp_indirect_lock_t *__kmp_allocate_indirect_lock(void **user_lock, lck->type = tag; if (OMP_LOCK_T_SIZE < sizeof(void *)) { - *((kmp_lock_index_t *)user_lock) = idx - << 1; // indirect lock word must be even + *(kmp_lock_index_t *)&(((kmp_base_tas_lock_t *)user_lock)->poll) = + idx << 1; // indirect lock word must be even } else { *((kmp_indirect_lock_t **)user_lock) = lck; } diff --git a/openmp/runtime/src/kmp_lock.h b/openmp/runtime/src/kmp_lock.h index f21179b4eb68a1..6202f3d617cc59 100644 --- a/openmp/runtime/src/kmp_lock.h +++ b/openmp/runtime/src/kmp_lock.h @@ -50,7 +50,7 @@ typedef struct ident ident_t; // recent versions), but we are bounded by the pointer-sized chunks that // the Intel compiler allocates. -#if KMP_OS_LINUX && defined(KMP_GOMP_COMPAT) +#if (KMP_OS_LINUX || KMP_OS_AIX) && defined(KMP_GOMP_COMPAT) #define OMP_LOCK_T_SIZE sizeof(int) #define OMP_NEST_LOCK_T_SIZE sizeof(void *) #else @@ -120,8 +120,16 @@ extern void __kmp_validate_locks(void); struct kmp_base_tas_lock { // KMP_LOCK_FREE(tas) => unlocked; locked: (gtid+1) of owning thread +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) && \ + __LP64__ + // Flip the ordering of the high and low 32-bit member to be consistent + // with the memory layout of the address in 64-bit big-endian. 
+ kmp_int32 depth_locked; // depth locked, for nested locks only + std::atomic poll; +#else std::atomic poll; kmp_int32 depth_locked; // depth locked, for nested locks only +#endif }; typedef struct kmp_base_tas_lock kmp_base_tas_lock_t; @@ -1138,11 +1146,13 @@ extern int (**__kmp_indirect_test)(kmp_user_lock_p, kmp_int32); // Extracts direct lock tag from a user lock pointer #define KMP_EXTRACT_D_TAG(l) \ - (*((kmp_dyna_lock_t *)(l)) & ((1 << KMP_LOCK_SHIFT) - 1) & \ - -(*((kmp_dyna_lock_t *)(l)) & 1)) + ((kmp_dyna_lock_t)((kmp_base_tas_lock_t *)(l))->poll & \ + ((1 << KMP_LOCK_SHIFT) - 1) & \ + -((kmp_dyna_lock_t)((kmp_tas_lock_t *)(l))->lk.poll & 1)) // Extracts indirect lock index from a user lock pointer -#define KMP_EXTRACT_I_INDEX(l) (*(kmp_lock_index_t *)(l) >> 1) +#define KMP_EXTRACT_I_INDEX(l) \ + ((kmp_lock_index_t)((kmp_base_tas_lock_t *)(l))->poll >> 1) // Returns function pointer to the direct lock function with l (kmp_dyna_lock_t // *) and op (operation type). diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h index a0552dd930a62a..9cd0aefaea2dc0 100644 --- a/openmp/runtime/src/kmp_os.h +++ b/openmp/runtime/src/kmp_os.h @@ -75,7 +75,8 @@ #error Unknown compiler #endif -#if (KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_FREEBSD) && !KMP_OS_WASI +#if (KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_FREEBSD || KMP_OS_AIX) && \ + !KMP_OS_WASI #define KMP_AFFINITY_SUPPORTED 1 #if KMP_OS_WINDOWS && KMP_ARCH_X86_64 #define KMP_GROUP_AFFINITY 1 diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp index d2157b10b7819a..ec86ee07472c1e 100644 --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -255,8 +255,13 @@ static void __kmp_stg_parse_bool(char const *name, char const *value, // placed here in order to use __kmp_round4k static function void __kmp_check_stksize(size_t *val) { // if system stack size is too big then limit the size for worker threads +#if KMP_OS_AIX + if (*val > 
KMP_DEFAULT_STKSIZE * 2) // Use 2 times, 16 is too large for AIX. + *val = KMP_DEFAULT_STKSIZE * 2; +#else if (*val > KMP_DEFAULT_STKSIZE * 16) // just a heuristics... *val = KMP_DEFAULT_STKSIZE * 16; +#endif if (*val < __kmp_sys_min_stksize) *val = __kmp_sys_min_stksize; if (*val > KMP_MAX_STKSIZE) diff --git a/openmp/runtime/src/z_AIX_asm.S b/openmp/runtime/src/z_AIX_asm.S new file mode 100644 index 00000000000000..d711fcb7a7854f --- /dev/null +++ b/openmp/runtime/src/z_AIX_asm.S @@ -0,0 +1,410 @@ +// z_AIX_asm.S: - microtasking routines specifically +// written for Power platforms running AIX OS + +// +////===----------------------------------------------------------------------===// +//// +//// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +//// See https://llvm.org/LICENSE.txt for license information. +//// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//// +////===----------------------------------------------------------------------===// +// + +// ----------------------------------------------------------------------- +// macros +// ----------------------------------------------------------------------- + +#include "kmp_config.h" + +#if KMP_OS_AIX +//------------------------------------------------------------------------ +// int +// __kmp_invoke_microtask( void (*pkfn) (int *gtid, int *tid, ...), +// int gtid, int tid, +// int argc, void *p_argv[] +// #if OMPT_SUPPORT +// , +// void **exit_frame_ptr +// #endif +// ) { +// #if OMPT_SUPPORT +// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0); +// #endif +// +// (*pkfn)( & gtid, & tid, p_argv[0], ... ); +// +// // FIXME: This is done at call-site and can be removed here. 
+// #if OMPT_SUPPORT +// *exit_frame_ptr = 0; +// #endif +// +// return 1; +// } +// +// parameters: +// r3: pkfn +// r4: gtid +// r5: tid +// r6: argc +// r7: p_argv +// r8: &exit_frame +// +// return: r3 (always 1/TRUE) +// + +#if KMP_ARCH_PPC64_XCOFF + + .globl __kmp_invoke_microtask[DS] + .globl .__kmp_invoke_microtask + .align 4 + .csect __kmp_invoke_microtask[DS],3 + .vbyte 8, .__kmp_invoke_microtask + .vbyte 8, TOC[TC0] + .vbyte 8, 0 + .csect .text[PR],2 + .machine "pwr7" +.__kmp_invoke_microtask: + + +// -- Begin __kmp_invoke_microtask +// mark_begin; + +// We need to allocate a stack frame large enough to hold all of the parameters +// on the stack for the microtask plus what this function needs. That's 48 +// bytes under the XCOFF64 ABI, plus max(64, 8*(2 + argc)) for +// the parameters to the microtask (gtid, tid, argc elements of p_argv), +// plus 8 bytes to store the values of r4 and r5, and 8 bytes to store r31. +// With OMP-T support, we need an additional 8 bytes to save r30 to hold +// a copy of r8. +// Stack offsets relative to stack pointer: +// r31: -8, r30: -16, gtid: -20, tid: -24 + + mflr 0 + std 31, -8(1) # Save r31 to the stack + std 0, 16(1) # Save LR to the linkage area + +// This is unusual because normally we'd set r31 equal to r1 after the stack +// frame is established. In this case, however, we need to dynamically compute +// the stack frame size, and so we keep a direct copy of r1 to access our +// register save areas and restore the r1 value before returning. + mr 31, 1 + +// Compute the size of the "argc" portion of the parameter save area. +// The parameter save area is always at least 64 bytes long (i.e. 8 regs) +// The microtask has (2 + argc) parameters, so if argc <= 6, we need to +// to allocate 8*6 bytes, not 8*argc. + li 0, 6 + cmpwi 0, 6, 6 + iselgt 0, 6, 0 # r0 = (argc > 6)? argc : 6 + sldi 0, 0, 3 # r0 = 8 * max(argc, 6) + +// Compute the size necessary for the local stack frame. 
+// 88 = 48 + 4 (for r4) + 4 (for r5) + 8 (for r31) + 8 (for OMP-T r30) + +// 8 (parameter gtid) + 8 (parameter tid) + li 12, 88 + add 12, 0, 12 + neg 12, 12 + +// We need to make sure that the stack frame stays aligned (to 16 bytes). + li 0, -16 + and 12, 0, 12 + +// Establish the local stack frame. + stdux 1, 1, 12 + +#if OMPT_SUPPORT + std 30, -16(31) # Save r30 to the stack + std 1, 0(8) + mr 30, 8 +#endif + +// Store gtid and tid to the stack because they're passed by reference to the microtask. + stw 4, -20(31) # Save gtid to the stack + stw 5, -24(31) # Save tid to the stack + + mr 12, 6 # r12 = argc + mr 4, 7 # r4 = p_argv + + cmpwi 0, 12, 1 + blt 0, .Lcall # if (argc < 1) goto .Lcall + + ld 5, 0(4) # r5 = p_argv[0] + + cmpwi 0, 12, 2 + blt 0, .Lcall # if (argc < 2) goto .Lcall + + ld 6, 8(4) # r6 = p_argv[1] + + cmpwi 0, 12, 3 + blt 0, .Lcall # if (argc < 3) goto .Lcall + + ld 7, 16(4) # r7 = p_argv[2] + + cmpwi 0, 12, 4 + blt 0, .Lcall # if (argc < 4) goto .Lcall + + ld 8, 24(4) # r8 = p_argv[3] + + cmpwi 0, 12, 5 + blt 0, .Lcall # if (argc < 5) goto .Lcall + + ld 9, 32(4) # r9 = p_argv[4] + + cmpwi 0, 12, 6 + blt 0, .Lcall # if (argc < 6) goto .Lcall + + ld 10, 40(4) # r10 = p_argv[5] + + cmpwi 0, 12, 7 + blt 0, .Lcall # if (argc < 7) goto .Lcall + +// There are more than 6 microtask parameters, so we need to store the +// remainder to the stack. + addi 12, 12, -6 # argc -= 6 + mtctr 12 + +// These are set to 8 bytes before the first desired store address (we're using +// pre-increment loads and stores in the loop below). The parameter save area +// for the microtask begins 48 + 8*8 == 112 bytes above r1 for XCOFF64. + addi 4, 4, 40 # p_argv = p_argv + 5 + # (i.e. skip the 5 elements we already processed) + addi 12, 1, 104 # r12 = stack offset (112 - 8) + +.Lnext: + ldu 0, 8(4) + stdu 0, 8(12) + bdnz .Lnext + +.Lcall: + std 2, 40(1) # Save the TOC pointer to the linkage area +// Load the actual function address from the function descriptor. 
+ ld 12, 0(3) # Function address
+ ld 2, 8(3) # TOC pointer
+ ld 11, 16(3) # Environment pointer
+
+ addi 3, 31, -20 # r3 = &gtid
+ addi 4, 31, -24 # r4 = &tid
+
+ mtctr 12 # CTR = function address
+ bctrl # Branch to CTR
+ ld 2, 40(1) # Restore TOC pointer from linkage area
+
+#if OMPT_SUPPORT
+ li 3, 0
+ std 3, 0(30)
+#endif
+
+ li 3, 1
+
+#if OMPT_SUPPORT
+ ld 30, -16(31) # Restore r30 from the saved value on the stack
+#endif
+
+ mr 1, 31
+ ld 31, -8(1) # Restore r31 from the saved value on the stack
+ ld 0, 16(1)
+ mtlr 0 # Restore LR from the linkage area
+ blr # Branch to LR
+
+#else // KMP_ARCH_PPC_XCOFF
+
+ .globl __kmp_invoke_microtask[DS]
+ .globl .__kmp_invoke_microtask
+ .align 4
+ .csect __kmp_invoke_microtask[DS],2
+ .vbyte 4, .__kmp_invoke_microtask
+ .vbyte 4, TOC[TC0]
+ .vbyte 4, 0
+ .csect .text[PR],2
+ .machine "pwr7"
+.__kmp_invoke_microtask:
+
+
+// -- Begin __kmp_invoke_microtask
+// mark_begin;
+
+// We need to allocate a stack frame large enough to hold all of the parameters
+// on the stack for the microtask plus what this function needs. That's 24
+// bytes under the XCOFF ABI, plus max(32, 8*(2 + argc)) for
+// the parameters to the microtask (gtid, tid, argc elements of p_argv),
+// plus 8 bytes to store the values of r4 and r5, and 4 bytes to store r31.
+// With OMP-T support, we need an additional 4 bytes to save r30 to hold
+// a copy of r8.
+// Stack offsets relative to stack pointer:
+// r31: -4, r30: -8, gtid: -12, tid: -16
+
+ mflr 0
+ stw 31, -4(1) # Save r31 to the stack
+ stw 0, 8(1) # Save LR to the linkage area
+
+// This is unusual because normally we'd set r31 equal to r1 after the stack
+// frame is established. In this case, however, we need to dynamically compute
+// the stack frame size, and so we keep a direct copy of r1 to access our
+// register save areas and restore the r1 value before returning.
+ mr 31, 1
+
+// Compute the size of the "argc" portion of the parameter save area. 
+// The parameter save area is always at least 32 bytes long (i.e. 8 regs) +// The microtask has (2 + argc) parameters, so if argc <= 6, we need to +// to allocate 4*6 bytes, not 4*argc. + li 0, 6 + cmpwi 0, 6, 6 + iselgt 0, 6, 0 # r0 = (argc > 6)? argc : 6 + slwi 0, 0, 2 # r0 = 4 * max(argc, 6) + +// Compute the size necessary for the local stack frame. +// 56 = 32 + 4 (for r4) + 4 (for r5) + 4 (for r31) + 4 (for OMP-T r30) + +// 4 (parameter gtid) + 4 (parameter tid) + li 12, 56 + add 12, 0, 12 + neg 12, 12 + +// We need to make sure that the stack frame stays aligned (to 16 bytes). + li 0, -16 + and 12, 0, 12 + +// Establish the local stack frame. + stwux 1, 1, 12 + +#if OMPT_SUPPORT + stw 30, -8(31) # Save r30 to the stack + stw 1, 0(8) + mr 30, 8 +#endif + +// Store gtid and tid to the stack because they're passed by reference to the microtask. + stw 4, -12(31) # Save gtid to the stack + stw 5, -16(31) # Save tid to the stack + + mr 12, 6 # r12 = argc + mr 4, 7 # r4 = p_argv + + cmpwi 0, 12, 1 + blt 0, .Lcall # if (argc < 1) goto .Lcall + + lwz 5, 0(4) # r5 = p_argv[0] + + cmpwi 0, 12, 2 + blt 0, .Lcall # if (argc < 2) goto .Lcall + + lwz 6, 4(4) # r6 = p_argv[1] + + cmpwi 0, 12, 3 + blt 0, .Lcall # if (argc < 3) goto .Lcall + + lwz 7, 8(4) # r7 = p_argv[2] + + cmpwi 0, 12, 4 + blt 0, .Lcall # if (argc < 4) goto .Lcall + + lwz 8, 12(4) # r8 = p_argv[3] + + cmpwi 0, 12, 5 + blt 0, .Lcall # if (argc < 5) goto .Lcall + + lwz 9, 16(4) # r9 = p_argv[4] + + cmpwi 0, 12, 6 + blt 0, .Lcall # if (argc < 6) goto .Lcall + + lwz 10, 20(4) # r10 = p_argv[5] + + cmpwi 0, 12, 7 + blt 0, .Lcall # if (argc < 7) goto .Lcall + +// There are more than 6 microtask parameters, so we need to store the +// remainder to the stack. + addi 12, 12, -6 # argc -= 6 + mtctr 12 + +// These are set to 4 bytes before the first desired store address (we're using +// pre-increment loads and stores in the loop below). 
The parameter save area
+// for the microtask begins 24 + 4*8 == 56 bytes above r1 for XCOFF.
+ addi 4, 4, 20 # p_argv = p_argv + 5
+ # (i.e. skip the 5 elements we already processed)
+ addi 12, 1, 52 # r12 = stack offset (56 - 4)
+
+.Lnext:
+ lwzu 0, 4(4)
+ stwu 0, 4(12)
+ bdnz .Lnext
+
+.Lcall:
+ stw 2, 20(1) # Save the TOC pointer to the linkage area
+// Load the actual function address from the function descriptor.
+ lwz 12, 0(3) # Function address
+ lwz 2, 4(3) # TOC pointer
+ lwz 11, 8(3) # Environment pointer
+
+ addi 3, 31, -12 # r3 = &gtid
+ addi 4, 31, -16 # r4 = &tid
+
+ mtctr 12 # CTR = function address
+ bctrl # Branch to CTR
+ lwz 2, 20(1) # Restore TOC pointer from linkage area
+
+#if OMPT_SUPPORT
+ li 3, 0
+ stw 3, 0(30)
+#endif
+
+ li 3, 1
+
+#if OMPT_SUPPORT
+ lwz 30, -8(31) # Restore r30 from the saved value on the stack
+#endif
+
+ mr 1, 31
+ lwz 31, -4(1) # Restore r31 from the saved value on the stack
+ lwz 0, 8(1)
+ mtlr 0 # Restore LR from the linkage area
+ blr # Branch to LR
+
+#endif // KMP_ARCH_PPC64_XCOFF
+
+.Lfunc_end0:
+ .vbyte 4, 0x00000000 # Traceback table begin
+ .byte 0x00 # Version = 0
+ .byte 0x09 # Language = CPlusPlus
+ .byte 0x20 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue
+ # +HasTraceBackTableOffset, -IsInternalProcedure
+ # -HasControlledStorage, -IsTOCless
+ # -IsFloatingPointPresent
+ # -IsFloatingPointOperationLogOrAbortEnabled
+ .byte 0x61 # -IsInterruptHandler, +IsFunctionNamePresent, +IsAllocaUsed
+ # OnConditionDirective = 0, -IsCRSaved, +IsLRSaved
+ .byte 0x80 # +IsBackChainStored, -IsFixup, NumOfFPRsSaved = 0
+#if OMPT_SUPPORT
+ .byte 0x02 # -HasExtensionTable, -HasVectorInfo, NumOfGPRsSaved = 2
+ .byte 0x06 # NumberOfFixedParms = 6
+#else
+ .byte 0x01 # -HasExtensionTable, -HasVectorInfo, NumOfGPRsSaved = 1
+ .byte 0x05 # NumberOfFixedParms = 5
+#endif
+ .byte 0x01 # NumberOfFPParms = 0, +HasParmsOnStack
+ .vbyte 4, 0x00000000 # Parameter type = i, i, i, i, i
+ .vbyte 4, .Lfunc_end0-.__kmp_invoke_microtask # 
Function size + .vbyte 2, 0x0016 # Function name len = 22 + .byte "__kmp_invoke_microtask" # Function Name + .byte 0x1f # AllocaRegister = 31 + # -- End function + +// -- End __kmp_invoke_microtask + +// Support for unnamed common blocks. + + .comm .gomp_critical_user_, 32, 3 +#if KMP_ARCH_PPC64_XCOFF + .csect __kmp_unnamed_critical_addr[RW],3 +#else + .csect __kmp_unnamed_critical_addr[RW],2 +#endif + .globl __kmp_unnamed_critical_addr[RW] + .ptr .gomp_critical_user_ + +// -- End unnamed common block + + .toc + +#endif // KMP_OS_AIX diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp index 513ec6517d00bd..b9ff96873702b1 100644 --- a/openmp/runtime/src/z_Linux_util.cpp +++ b/openmp/runtime/src/z_Linux_util.cpp @@ -116,7 +116,7 @@ static void __kmp_print_cond(char *buffer, kmp_cond_align_t *cond) { } #endif -#if ((KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED) +#if ((KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX) && KMP_AFFINITY_SUPPORTED) /* Affinity support */ @@ -132,6 +132,29 @@ void __kmp_affinity_bind_thread(int which) { KMP_CPU_FREE_FROM_STACK(mask); } +#if KMP_OS_AIX +void __kmp_affinity_determine_capable(const char *env_var) { + // All versions of AIX support bindprocessor(). + + size_t mask_size = __kmp_xproc / CHAR_BIT; + // Round up to byte boundary. + if (__kmp_xproc % CHAR_BIT) + ++mask_size; + + // Round up to the mask_size_type boundary. 
+ if (mask_size % sizeof(__kmp_affin_mask_size)) + mask_size += sizeof(__kmp_affin_mask_size) - + mask_size % sizeof(__kmp_affin_mask_size); + KMP_AFFINITY_ENABLE(mask_size); + KA_TRACE(10, + ("__kmp_affinity_determine_capable: " + "AIX OS affinity interface bindprocessor functional (mask size = " + "%" KMP_SIZE_T_SPEC ").\n", + __kmp_affin_mask_size)); +} + +#else // !KMP_OS_AIX + /* Determine if we can access affinity functionality on this version of * Linux* OS by checking __NR_sched_{get,set}affinity system calls, and set * __kmp_affin_mask_size to the appropriate value (0 means not capable). */ @@ -259,8 +282,9 @@ void __kmp_affinity_determine_capable(const char *env_var) { KMP_WARNING(AffCantGetMaskSize, env_var); } } - -#endif // KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED +#endif // KMP_OS_AIX +#endif // (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX) && + // KMP_AFFINITY_SUPPORTED #if KMP_USE_FUTEX @@ -476,7 +500,7 @@ static void *__kmp_launch_worker(void *thr) { #endif /* KMP_BLOCK_SIGNALS */ void *exit_val; #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ - KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS + KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_AIX void *volatile padding = 0; #endif int gtid; @@ -525,7 +549,7 @@ static void *__kmp_launch_worker(void *thr) { #endif /* KMP_BLOCK_SIGNALS */ #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ - KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS + KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_AIX if (__kmp_stkoffset > 0 && gtid > 0) { padding = KMP_ALLOCA(gtid * __kmp_stkoffset); (void)padding; @@ -1245,7 +1269,7 @@ static void __kmp_atfork_child(void) { ++__kmp_fork_count; #if KMP_AFFINITY_SUPPORTED -#if KMP_OS_LINUX || KMP_OS_FREEBSD +#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX // reset the affinity in the child to the initial thread // affinity in the parent kmp_set_thread_affinity_mask_initial(); @@ -2214,6 +2238,7 @@ int 
__kmp_is_address_mapped(void *addr) { found = (int)addr < (__builtin_wasm_memory_size(0) * PAGESIZE); #elif KMP_OS_DRAGONFLY || KMP_OS_SOLARIS || KMP_OS_AIX + (void)rc; // FIXME(DragonFly, Solaris, AIX): Implement this found = 1; diff --git a/openmp/runtime/test/lit.cfg b/openmp/runtime/test/lit.cfg index 4a457f4cc41f75..2126928bb411a9 100644 --- a/openmp/runtime/test/lit.cfg +++ b/openmp/runtime/test/lit.cfg @@ -129,7 +129,7 @@ if config.operating_system == 'NetBSD': if config.operating_system == 'Darwin': config.available_features.add("darwin") -if config.operating_system in ['Linux', 'Windows']: +if config.operating_system in ['Linux', 'Windows', 'AIX']: config.available_features.add('affinity') if config.operating_system in ['Linux']: diff --git a/openmp/runtime/test/tasking/bug_nested_proxy_task.c b/openmp/runtime/test/tasking/bug_nested_proxy_task.c index 43502bdcd1abd1..9e0b412efce609 100644 --- a/openmp/runtime/test/tasking/bug_nested_proxy_task.c +++ b/openmp/runtime/test/tasking/bug_nested_proxy_task.c @@ -50,12 +50,21 @@ typedef struct kmp_depend_info { union { kmp_uint8 flag; // flag as an unsigned char struct { // flag as a set of 8 bits - unsigned in : 1; - unsigned out : 1; - unsigned mtx : 1; - unsigned set : 1; - unsigned unused : 3; - unsigned all : 1; +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + unsigned all : 1; + unsigned unused : 3; + unsigned set : 1; + unsigned mtx : 1; + unsigned out : 1; + unsigned in : 1; +#else + unsigned in : 1; + unsigned out : 1; + unsigned mtx : 1; + unsigned set : 1; + unsigned unused : 3; + unsigned all : 1; +#endif } flags; }; } kmp_depend_info_t; diff --git a/openmp/runtime/test/tasking/bug_proxy_task_dep_waiting.c b/openmp/runtime/test/tasking/bug_proxy_task_dep_waiting.c index ff75df51aff077..1e86d574f4f6a8 100644 --- a/openmp/runtime/test/tasking/bug_proxy_task_dep_waiting.c +++ b/openmp/runtime/test/tasking/bug_proxy_task_dep_waiting.c @@ -47,12 +47,21 @@ typedef struct 
kmp_depend_info { union { kmp_uint8 flag; // flag as an unsigned char struct { // flag as a set of 8 bits - unsigned in : 1; - unsigned out : 1; - unsigned mtx : 1; - unsigned set : 1; - unsigned unused : 3; - unsigned all : 1; +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + unsigned all : 1; + unsigned unused : 3; + unsigned set : 1; + unsigned mtx : 1; + unsigned out : 1; + unsigned in : 1; +#else + unsigned in : 1; + unsigned out : 1; + unsigned mtx : 1; + unsigned set : 1; + unsigned unused : 3; + unsigned all : 1; +#endif } flags; }; } kmp_depend_info_t; diff --git a/openmp/runtime/test/tasking/hidden_helper_task/common.h b/openmp/runtime/test/tasking/hidden_helper_task/common.h index 402ecf3ed553c9..68e2b584c87739 100644 --- a/openmp/runtime/test/tasking/hidden_helper_task/common.h +++ b/openmp/runtime/test/tasking/hidden_helper_task/common.h @@ -17,9 +17,21 @@ typedef struct kmp_depend_info { union { unsigned char flag; struct { - bool in : 1; - bool out : 1; - bool mtx : 1; +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + unsigned all : 1; + unsigned unused : 3; + unsigned set : 1; + unsigned mtx : 1; + unsigned out : 1; + unsigned in : 1; +#else + unsigned in : 1; + unsigned out : 1; + unsigned mtx : 1; + unsigned set : 1; + unsigned unused : 3; + unsigned all : 1; +#endif } flags; }; } kmp_depend_info_t; diff --git a/polly/lib/Analysis/DependenceInfo.cpp b/polly/lib/Analysis/DependenceInfo.cpp index 69257c603877ea..d58dc9917bc91f 100644 --- a/polly/lib/Analysis/DependenceInfo.cpp +++ b/polly/lib/Analysis/DependenceInfo.cpp @@ -950,8 +950,8 @@ class DependenceInfoPrinterLegacyPass final : public ScopPass { bool runOnScop(Scop &S) override { DependenceInfo &P = getAnalysis(); - OS << "Printing analysis '" << P.getPassName() << "' for " << "region: '" - << S.getRegion().getNameStr() << "' in function '" + OS << "Printing analysis '" << P.getPassName() << "' for " + << "region: '" << 
S.getRegion().getNameStr() << "' in function '" << S.getFunction().getName() << "':\n"; P.printScop(OS, S); diff --git a/polly/lib/Analysis/ScopBuilder.cpp b/polly/lib/Analysis/ScopBuilder.cpp index c62cb2a85c835c..64314d6041b8e9 100644 --- a/polly/lib/Analysis/ScopBuilder.cpp +++ b/polly/lib/Analysis/ScopBuilder.cpp @@ -2689,9 +2689,10 @@ void ScopBuilder::addUserContext() { if (NameContext != NameUserContext) { std::string SpaceStr = stringFromIslObj(Space, "null"); errs() << "Error: the name of dimension " << i - << " provided in -polly-context " << "is '" << NameUserContext - << "', but the name in the computed " << "context is '" - << NameContext << "'. Due to this name mismatch, " + << " provided in -polly-context " + << "is '" << NameUserContext << "', but the name in the computed " + << "context is '" << NameContext + << "'. Due to this name mismatch, " << "the -polly-context option is ignored. Please provide " << "the context in the parameter space: " << SpaceStr << ".\n"; return; diff --git a/polly/lib/Analysis/ScopDetectionDiagnostic.cpp b/polly/lib/Analysis/ScopDetectionDiagnostic.cpp index 364e21aef207ce..58c2b4fedc478d 100644 --- a/polly/lib/Analysis/ScopDetectionDiagnostic.cpp +++ b/polly/lib/Analysis/ScopDetectionDiagnostic.cpp @@ -45,7 +45,7 @@ using namespace llvm; #define DEBUG_TYPE "polly-detect" #define SCOP_STAT(NAME, DESC) \ - { "polly-detect", "NAME", "Number of rejected regions: " DESC } + {"polly-detect", "NAME", "Number of rejected regions: " DESC} static Statistic RejectStatistics[] = { SCOP_STAT(CFG, ""), diff --git a/polly/lib/Exchange/JSONExporter.cpp b/polly/lib/Exchange/JSONExporter.cpp index 74d4e6c7993fa3..63fb06a634cc12 100644 --- a/polly/lib/Exchange/JSONExporter.cpp +++ b/polly/lib/Exchange/JSONExporter.cpp @@ -842,7 +842,7 @@ class JSONImporterPrinterLegacyPass final : public ScopPass { public: static char ID; - JSONImporterPrinterLegacyPass() : JSONImporterPrinterLegacyPass(outs()){}; + JSONImporterPrinterLegacyPass() : 
JSONImporterPrinterLegacyPass(outs()) {} explicit JSONImporterPrinterLegacyPass(llvm::raw_ostream &OS) : ScopPass(ID), OS(OS) {} diff --git a/polly/lib/Transform/DeLICM.cpp b/polly/lib/Transform/DeLICM.cpp index 51e701346563a1..dae5e79639f7be 100644 --- a/polly/lib/Transform/DeLICM.cpp +++ b/polly/lib/Transform/DeLICM.cpp @@ -1463,7 +1463,7 @@ class DeLICMPrinterLegacyPass final : public ScopPass { public: static char ID; - DeLICMPrinterLegacyPass() : DeLICMPrinterLegacyPass(outs()){}; + DeLICMPrinterLegacyPass() : DeLICMPrinterLegacyPass(outs()) {} explicit DeLICMPrinterLegacyPass(llvm::raw_ostream &OS) : ScopPass(ID), OS(OS) {} diff --git a/polly/lib/Transform/FlattenSchedule.cpp b/polly/lib/Transform/FlattenSchedule.cpp index 53e230be7a6945..87bf642ba0d92b 100644 --- a/polly/lib/Transform/FlattenSchedule.cpp +++ b/polly/lib/Transform/FlattenSchedule.cpp @@ -103,7 +103,7 @@ class FlattenSchedulePrinterLegacyPass final : public ScopPass { static char ID; FlattenSchedulePrinterLegacyPass() - : FlattenSchedulePrinterLegacyPass(outs()){}; + : FlattenSchedulePrinterLegacyPass(outs()) {} explicit FlattenSchedulePrinterLegacyPass(llvm::raw_ostream &OS) : ScopPass(ID), OS(OS) {} diff --git a/polly/lib/Transform/ForwardOpTree.cpp b/polly/lib/Transform/ForwardOpTree.cpp index 2bed3e35412d76..5e6de2e182a526 100644 --- a/polly/lib/Transform/ForwardOpTree.cpp +++ b/polly/lib/Transform/ForwardOpTree.cpp @@ -1149,7 +1149,7 @@ class ForwardOpTreePrinterLegacyPass final : public ScopPass { public: static char ID; - ForwardOpTreePrinterLegacyPass() : ForwardOpTreePrinterLegacyPass(outs()){}; + ForwardOpTreePrinterLegacyPass() : ForwardOpTreePrinterLegacyPass(outs()) {} explicit ForwardOpTreePrinterLegacyPass(llvm::raw_ostream &OS) : ScopPass(ID), OS(OS) {}